ocfs2: Make ocfs2_extent_tree the first-class representation of a tree.
fs/ocfs2/alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52
53 struct ocfs2_extent_tree_operations {
54         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
55                                    u64 blkno);
56         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
57         void (*eo_update_clusters)(struct inode *inode,
58                                    struct ocfs2_extent_tree *et,
59                                    u32 new_clusters);
60         int (*eo_insert_check)(struct inode *inode,
61                                struct ocfs2_extent_tree *et,
62                                struct ocfs2_extent_rec *rec);
63         int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
64
65         /* These are internal to ocfs2_extent_tree and don't have
66          * accessor functions */
67         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
68         void (*eo_fill_max_leaf_clusters)(struct inode *inode,
69                                           struct ocfs2_extent_tree *et);
70 };
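
/*
 * Each on-disk structure that can root an extent tree (the dinode, an
 * xattr value, an xattr block) supplies one of these operation
 * vectors.  The generic btree code never touches the container
 * directly; it goes through the ocfs2_et_*() wrappers further down,
 * e.g. ocfs2_et_get_last_eb_blk(et) simply calls
 * et->et_ops->eo_get_last_eb_blk(et).
 */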
71
72
73 /*
74  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
75  * in the methods.
76  */
77 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
78 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
79                                          u64 blkno);
80 static void ocfs2_dinode_update_clusters(struct inode *inode,
81                                          struct ocfs2_extent_tree *et,
82                                          u32 clusters);
83 static int ocfs2_dinode_insert_check(struct inode *inode,
84                                      struct ocfs2_extent_tree *et,
85                                      struct ocfs2_extent_rec *rec);
86 static int ocfs2_dinode_sanity_check(struct inode *inode,
87                                      struct ocfs2_extent_tree *et);
88 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
89 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
90         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
91         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
92         .eo_update_clusters     = ocfs2_dinode_update_clusters,
93         .eo_insert_check        = ocfs2_dinode_insert_check,
94         .eo_sanity_check        = ocfs2_dinode_sanity_check,
95         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
96 };
97
98 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
99                                          u64 blkno)
100 {
101         struct ocfs2_dinode *di = et->et_object;
102
103         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
104         di->i_last_eb_blk = cpu_to_le64(blkno);
105 }
106
107 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
108 {
109         struct ocfs2_dinode *di = et->et_object;
110
111         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
112         return le64_to_cpu(di->i_last_eb_blk);
113 }
114
115 static void ocfs2_dinode_update_clusters(struct inode *inode,
116                                          struct ocfs2_extent_tree *et,
117                                          u32 clusters)
118 {
119         struct ocfs2_dinode *di = et->et_object;
120
121         le32_add_cpu(&di->i_clusters, clusters);
122         spin_lock(&OCFS2_I(inode)->ip_lock);
123         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
124         spin_unlock(&OCFS2_I(inode)->ip_lock);
125 }
126
127 static int ocfs2_dinode_insert_check(struct inode *inode,
128                                      struct ocfs2_extent_tree *et,
129                                      struct ocfs2_extent_rec *rec)
130 {
131         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
132
133         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
134         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
135                         (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
136                         "Device %s, asking for sparse allocation: inode %llu, "
137                         "cpos %u, clusters %u\n",
138                         osb->dev_str,
139                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
140                         rec->e_cpos,
141                         OCFS2_I(inode)->ip_clusters);
142
143         return 0;
144 }
145
146 static int ocfs2_dinode_sanity_check(struct inode *inode,
147                                      struct ocfs2_extent_tree *et)
148 {
149         int ret = 0;
150         struct ocfs2_dinode *di;
151
152         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
153
154         di = et->et_object;
155         if (!OCFS2_IS_VALID_DINODE(di)) {
156                 ret = -EIO;
157                 ocfs2_error(inode->i_sb,
158                         "Inode %llu has invalid path root",
159                         (unsigned long long)OCFS2_I(inode)->ip_blkno);
160         }
161
162         return ret;
163 }
164
165 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
166 {
167         struct ocfs2_dinode *di = et->et_object;
168
169         et->et_root_el = &di->id2.i_list;
170 }
171
172
173 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
174 {
175         struct ocfs2_xattr_value_root *xv = et->et_object;
176
177         et->et_root_el = &xv->xr_list;
178 }
179
180 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
181                                               u64 blkno)
182 {
183         struct ocfs2_xattr_value_root *xv =
184                 (struct ocfs2_xattr_value_root *)et->et_object;
185
186         xv->xr_last_eb_blk = cpu_to_le64(blkno);
187 }
188
189 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
190 {
191         struct ocfs2_xattr_value_root *xv =
192                 (struct ocfs2_xattr_value_root *) et->et_object;
193
194         return le64_to_cpu(xv->xr_last_eb_blk);
195 }
196
197 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
198                                               struct ocfs2_extent_tree *et,
199                                               u32 clusters)
200 {
201         struct ocfs2_xattr_value_root *xv =
202                 (struct ocfs2_xattr_value_root *)et->et_object;
203
204         le32_add_cpu(&xv->xr_clusters, clusters);
205 }
206
207 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
208         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
209         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
210         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
211         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
212 };
213
214 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
215 {
216         struct ocfs2_xattr_block *xb = et->et_object;
217
218         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
219 }
220
221 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
222                                                     struct ocfs2_extent_tree *et)
223 {
224         et->et_max_leaf_clusters =
225                 ocfs2_clusters_for_bytes(inode->i_sb,
226                                          OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
227 }
228
229 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
230                                              u64 blkno)
231 {
232         struct ocfs2_xattr_block *xb = et->et_object;
233         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
234
235         xt->xt_last_eb_blk = cpu_to_le64(blkno);
236 }
237
238 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
239 {
240         struct ocfs2_xattr_block *xb = et->et_object;
241         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
242
243         return le64_to_cpu(xt->xt_last_eb_blk);
244 }
245
246 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
247                                              struct ocfs2_extent_tree *et,
248                                              u32 clusters)
249 {
250         struct ocfs2_xattr_block *xb = et->et_object;
251
252         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
253 }
254
255 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
256         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
257         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
258         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
259         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
260         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
261 };
262
263 static void __ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
264                                     struct inode *inode,
265                                     struct buffer_head *bh,
266                                     void *obj,
267                                     struct ocfs2_extent_tree_operations *ops)
268 {
269         et->et_ops = ops;
270         get_bh(bh);
271         et->et_root_bh = bh;
272         if (!obj)
273                 obj = (void *)bh->b_data;
274         et->et_object = obj;
275
276         et->et_ops->eo_fill_root_el(et);
277         if (!et->et_ops->eo_fill_max_leaf_clusters)
278                 et->et_max_leaf_clusters = 0;
279         else
280                 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
281 }
282
283 void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
284                                   struct inode *inode,
285                                   struct buffer_head *bh)
286 {
287         __ocfs2_get_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
288 }
289
290 void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
291                                       struct inode *inode,
292                                       struct buffer_head *bh)
293 {
294         __ocfs2_get_extent_tree(et, inode, bh, NULL,
295                                 &ocfs2_xattr_tree_et_ops);
296 }
297
298 void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
299                                        struct inode *inode,
300                                        struct buffer_head *bh,
301                                        struct ocfs2_xattr_value_root *xv)
302 {
303         __ocfs2_get_extent_tree(et, inode, bh, xv,
304                                 &ocfs2_xattr_value_et_ops);
305 }
306
307 void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
308 {
309         brelse(et->et_root_bh);
310 }
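
/*
 * A minimal usage sketch of the get/put pairing above, assuming the
 * caller already has the inode and an up-to-date root buffer (the
 * names 'di_bh', 'osb' and 'ret' below are only for illustration):
 *
 *	struct ocfs2_extent_tree et;
 *
 *	ocfs2_get_dinode_extent_tree(&et, inode, di_bh);
 *	ret = ocfs2_num_free_extents(osb, inode, &et);
 *	ocfs2_put_extent_tree(&et);
 *
 * The get routine takes its own reference on the root buffer_head, so
 * every get must be balanced by a put once the tree work is done.
 */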
311
312 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
313                                             u64 new_last_eb_blk)
314 {
315         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
316 }
317
318 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
319 {
320         return et->et_ops->eo_get_last_eb_blk(et);
321 }
322
323 static inline void ocfs2_et_update_clusters(struct inode *inode,
324                                             struct ocfs2_extent_tree *et,
325                                             u32 clusters)
326 {
327         et->et_ops->eo_update_clusters(inode, et, clusters);
328 }
329
330 static inline int ocfs2_et_insert_check(struct inode *inode,
331                                         struct ocfs2_extent_tree *et,
332                                         struct ocfs2_extent_rec *rec)
333 {
334         int ret = 0;
335
336         if (et->et_ops->eo_insert_check)
337                 ret = et->et_ops->eo_insert_check(inode, et, rec);
338         return ret;
339 }
340
341 static inline int ocfs2_et_sanity_check(struct inode *inode,
342                                         struct ocfs2_extent_tree *et)
343 {
344         int ret = 0;
345
346         if (et->et_ops->eo_sanity_check)
347                 ret = et->et_ops->eo_sanity_check(inode, et);
348         return ret;
349 }
350
351 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
352 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
353                                          struct ocfs2_extent_block *eb);
354
355 /*
356  * Structures which describe a path through a btree, and functions to
357  * manipulate them.
358  *
359  * The idea here is to be as generic as possible with the tree
360  * manipulation code.
361  */
362 struct ocfs2_path_item {
363         struct buffer_head              *bh;
364         struct ocfs2_extent_list        *el;
365 };
366
367 #define OCFS2_MAX_PATH_DEPTH    5
368
369 struct ocfs2_path {
370         int                     p_tree_depth;
371         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
372 };
373
374 #define path_root_bh(_path) ((_path)->p_node[0].bh)
375 #define path_root_el(_path) ((_path)->p_node[0].el)
376 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
377 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
378 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
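
/*
 * For illustration: in a depth-2 tree, p_node[0] holds the root extent
 * list (e.g. a dinode's id2.i_list), p_node[1] an interior extent
 * block, and p_node[2] the leaf, so path_leaf_el() resolves to
 * p_node[2].el and path_num_items() to 3.
 */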
379
380 /*
381  * Reset the actual path elements so that we can re-use the structure
382  * to build another path. Generally, this involves freeing the buffer
383  * heads.
384  */
385 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
386 {
387         int i, start = 0, depth = 0;
388         struct ocfs2_path_item *node;
389
390         if (keep_root)
391                 start = 1;
392
393         for(i = start; i < path_num_items(path); i++) {
394                 node = &path->p_node[i];
395
396                 brelse(node->bh);
397                 node->bh = NULL;
398                 node->el = NULL;
399         }
400
401         /*
402          * Tree depth may change during truncate, or insert. If we're
403          * keeping the root extent list, then make sure that our path
404          * structure reflects the proper depth.
405          */
406         if (keep_root)
407                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
408
409         path->p_tree_depth = depth;
410 }
411
412 static void ocfs2_free_path(struct ocfs2_path *path)
413 {
414         if (path) {
415                 ocfs2_reinit_path(path, 0);
416                 kfree(path);
417         }
418 }
419
420 /*
421  * Copy all the elements of src into dest. After this call, src could be freed
422  * without affecting dest.
423  *
424  * Both paths should have the same root. Any non-root elements of dest
425  * will be freed.
426  */
427 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
428 {
429         int i;
430
431         BUG_ON(path_root_bh(dest) != path_root_bh(src));
432         BUG_ON(path_root_el(dest) != path_root_el(src));
433
434         ocfs2_reinit_path(dest, 1);
435
436         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
437                 dest->p_node[i].bh = src->p_node[i].bh;
438                 dest->p_node[i].el = src->p_node[i].el;
439
440                 if (dest->p_node[i].bh)
441                         get_bh(dest->p_node[i].bh);
442         }
443 }
444
445 /*
446  * Make the *dest path the same as src and re-initialize src path to
447  * have a root only.
448  */
449 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
450 {
451         int i;
452
453         BUG_ON(path_root_bh(dest) != path_root_bh(src));
454
455         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
456                 brelse(dest->p_node[i].bh);
457
458                 dest->p_node[i].bh = src->p_node[i].bh;
459                 dest->p_node[i].el = src->p_node[i].el;
460
461                 src->p_node[i].bh = NULL;
462                 src->p_node[i].el = NULL;
463         }
464 }
465
466 /*
467  * Insert an extent block at the given index.
468  *
469  * This will not take an additional reference on eb_bh.
470  */
471 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
472                                         struct buffer_head *eb_bh)
473 {
474         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
475
476         /*
477          * Right now, no root bh is an extent block, so this helps
478          * catch code errors with dinode trees. The assertion can be
479          * safely removed if we ever need to insert extent block
480          * structures at the root.
481          */
482         BUG_ON(index == 0);
483
484         path->p_node[index].bh = eb_bh;
485         path->p_node[index].el = &eb->h_list;
486 }
487
488 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
489                                          struct ocfs2_extent_list *root_el)
490 {
491         struct ocfs2_path *path;
492
493         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
494
495         path = kzalloc(sizeof(*path), GFP_NOFS);
496         if (path) {
497                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
498                 get_bh(root_bh);
499                 path_root_bh(path) = root_bh;
500                 path_root_el(path) = root_el;
501         }
502
503         return path;
504 }
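
/*
 * A typical caller builds a path straight from an extent tree root, for
 * example (a sketch, error handling omitted):
 *
 *	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 *	...
 *	ocfs2_free_path(path);
 *
 * ocfs2_new_path() returns NULL on allocation failure, and
 * ocfs2_free_path() accepts NULL.
 */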
505
506 /*
507  * Convenience function to journal all components in a path.
508  */
509 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
510                                      struct ocfs2_path *path)
511 {
512         int i, ret = 0;
513
514         if (!path)
515                 goto out;
516
517         for(i = 0; i < path_num_items(path); i++) {
518                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
519                                            OCFS2_JOURNAL_ACCESS_WRITE);
520                 if (ret < 0) {
521                         mlog_errno(ret);
522                         goto out;
523                 }
524         }
525
526 out:
527         return ret;
528 }
529
530 /*
531  * Return the index of the extent record which contains cluster #v_cluster.
532  * -1 is returned if it was not found.
533  *
534  * Should work fine on interior and exterior nodes.
535  */
536 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
537 {
538         int ret = -1;
539         int i;
540         struct ocfs2_extent_rec *rec;
541         u32 rec_end, rec_start, clusters;
542
543         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
544                 rec = &el->l_recs[i];
545
546                 rec_start = le32_to_cpu(rec->e_cpos);
547                 clusters = ocfs2_rec_clusters(el, rec);
548
549                 rec_end = rec_start + clusters;
550
551                 if (v_cluster >= rec_start && v_cluster < rec_end) {
552                         ret = i;
553                         break;
554                 }
555         }
556
557         return ret;
558 }
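
/*
 * For example, with leaf records {e_cpos 0, 4 clusters} and
 * {e_cpos 8, 4 clusters}, a search for v_cluster 9 returns index 1,
 * while a search for v_cluster 5 falls in the hole and returns -1.
 */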
559
560 enum ocfs2_contig_type {
561         CONTIG_NONE = 0,
562         CONTIG_LEFT,
563         CONTIG_RIGHT,
564         CONTIG_LEFTRIGHT,
565 };
566
567
568 /*
569  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
570  * ocfs2_extent_contig only work properly against leaf nodes!
571  */
572 static int ocfs2_block_extent_contig(struct super_block *sb,
573                                      struct ocfs2_extent_rec *ext,
574                                      u64 blkno)
575 {
576         u64 blk_end = le64_to_cpu(ext->e_blkno);
577
578         blk_end += ocfs2_clusters_to_blocks(sb,
579                                     le16_to_cpu(ext->e_leaf_clusters));
580
581         return blkno == blk_end;
582 }
583
584 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
585                                   struct ocfs2_extent_rec *right)
586 {
587         u32 left_range;
588
589         left_range = le32_to_cpu(left->e_cpos) +
590                 le16_to_cpu(left->e_leaf_clusters);
591
592         return (left_range == le32_to_cpu(right->e_cpos));
593 }
594
595 static enum ocfs2_contig_type
596         ocfs2_extent_contig(struct inode *inode,
597                             struct ocfs2_extent_rec *ext,
598                             struct ocfs2_extent_rec *insert_rec)
599 {
600         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
601
602         /*
603          * Refuse to coalesce extent records with different flag
604          * fields - we don't want to mix unwritten extents with user
605          * data.
606          */
607         if (ext->e_flags != insert_rec->e_flags)
608                 return CONTIG_NONE;
609
610         if (ocfs2_extents_adjacent(ext, insert_rec) &&
611             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
612                         return CONTIG_RIGHT;
613
614         blkno = le64_to_cpu(ext->e_blkno);
615         if (ocfs2_extents_adjacent(insert_rec, ext) &&
616             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
617                 return CONTIG_LEFT;
618
619         return CONTIG_NONE;
620 }
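
/*
 * For example, assuming 4 blocks per cluster: an existing record at
 * cpos 0 with 4 clusters and e_blkno 100 occupies blocks 100-115, so
 * an insert record at cpos 4 with e_blkno 116 is CONTIG_RIGHT; the
 * mirror case (the insert record ending exactly where the existing one
 * starts) is CONTIG_LEFT.  Records with differing e_flags are never
 * considered contiguous.
 */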
621
622 /*
623  * NOTE: We can have pretty much any combination of contiguousness and
624  * appending.
625  *
626  * The usefulness of APPEND_TAIL is more in that it lets us know that
627  * we'll have to update the path to that leaf.
628  */
629 enum ocfs2_append_type {
630         APPEND_NONE = 0,
631         APPEND_TAIL,
632 };
633
634 enum ocfs2_split_type {
635         SPLIT_NONE = 0,
636         SPLIT_LEFT,
637         SPLIT_RIGHT,
638 };
639
640 struct ocfs2_insert_type {
641         enum ocfs2_split_type   ins_split;
642         enum ocfs2_append_type  ins_appending;
643         enum ocfs2_contig_type  ins_contig;
644         int                     ins_contig_index;
645         int                     ins_tree_depth;
646 };
647
648 struct ocfs2_merge_ctxt {
649         enum ocfs2_contig_type  c_contig_type;
650         int                     c_has_empty_extent;
651         int                     c_split_covers_rec;
652 };
653
654 /*
655  * How many free extents have we got before we need more metadata?
656  */
657 int ocfs2_num_free_extents(struct ocfs2_super *osb,
658                            struct inode *inode,
659                            struct ocfs2_extent_tree *et)
660 {
661         int retval;
662         struct ocfs2_extent_list *el = NULL;
663         struct ocfs2_extent_block *eb;
664         struct buffer_head *eb_bh = NULL;
665         u64 last_eb_blk = 0;
666
667         mlog_entry_void();
668
669         el = et->et_root_el;
670         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
671
672         if (last_eb_blk) {
673                 retval = ocfs2_read_block(osb, last_eb_blk,
674                                           &eb_bh, OCFS2_BH_CACHED, inode);
675                 if (retval < 0) {
676                         mlog_errno(retval);
677                         goto bail;
678                 }
679                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
680                 el = &eb->h_list;
681         }
682
683         BUG_ON(el->l_tree_depth != 0);
684
685         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
686 bail:
687         if (eb_bh)
688                 brelse(eb_bh);
689
690         mlog_exit(retval);
691         return retval;
692 }
693
694 /* Expects the array to already be allocated.
695  *
696  * Sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
697  * l_count for you.
698  */
699 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
700                                      handle_t *handle,
701                                      struct inode *inode,
702                                      int wanted,
703                                      struct ocfs2_alloc_context *meta_ac,
704                                      struct buffer_head *bhs[])
705 {
706         int count, status, i;
707         u16 suballoc_bit_start;
708         u32 num_got;
709         u64 first_blkno;
710         struct ocfs2_extent_block *eb;
711
712         mlog_entry_void();
713
714         count = 0;
715         while (count < wanted) {
716                 status = ocfs2_claim_metadata(osb,
717                                               handle,
718                                               meta_ac,
719                                               wanted - count,
720                                               &suballoc_bit_start,
721                                               &num_got,
722                                               &first_blkno);
723                 if (status < 0) {
724                         mlog_errno(status);
725                         goto bail;
726                 }
727
728                 for(i = count;  i < (num_got + count); i++) {
729                         bhs[i] = sb_getblk(osb->sb, first_blkno);
730                         if (bhs[i] == NULL) {
731                                 status = -EIO;
732                                 mlog_errno(status);
733                                 goto bail;
734                         }
735                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
736
737                         status = ocfs2_journal_access(handle, inode, bhs[i],
738                                                       OCFS2_JOURNAL_ACCESS_CREATE);
739                         if (status < 0) {
740                                 mlog_errno(status);
741                                 goto bail;
742                         }
743
744                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
745                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
746                         /* Ok, set up the minimal stuff here. */
747                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
748                         eb->h_blkno = cpu_to_le64(first_blkno);
749                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
750                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
751                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
752                         eb->h_list.l_count =
753                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
754
755                         suballoc_bit_start++;
756                         first_blkno++;
757
758                         /* We'll also be dirtied by the caller, so
759                          * this isn't absolutely necessary. */
760                         status = ocfs2_journal_dirty(handle, bhs[i]);
761                         if (status < 0) {
762                                 mlog_errno(status);
763                                 goto bail;
764                         }
765                 }
766
767                 count += num_got;
768         }
769
770         status = 0;
771 bail:
772         if (status < 0) {
773                 for(i = 0; i < wanted; i++) {
774                         if (bhs[i])
775                                 brelse(bhs[i]);
776                         bhs[i] = NULL;
777                 }
778         }
779         mlog_exit(status);
780         return status;
781 }
782
783 /*
784  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
785  *
786  * Returns the sum of the rightmost extent rec logical offset and
787  * cluster count.
788  *
789  * ocfs2_add_branch() uses this to determine what logical cluster
790  * value should be populated into the leftmost new branch records.
791  *
792  * ocfs2_shift_tree_depth() uses this to determine the # clusters
793  * value for the new topmost tree record.
794  */
795 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
796 {
797         int i;
798
799         i = le16_to_cpu(el->l_next_free_rec) - 1;
800
801         return le32_to_cpu(el->l_recs[i].e_cpos) +
802                 ocfs2_rec_clusters(el, &el->l_recs[i]);
803 }
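
/*
 * For example, if the rightmost record has e_cpos 100 and covers 8
 * clusters, this returns 108, the first logical cluster past the
 * current end of the tree.
 */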
804
805 /*
806  * Add an entire tree branch to our inode. eb_bh is the extent block
807  * to start at, if we don't want to start the branch at the dinode
808  * structure.
809  *
810  * last_eb_bh is required as we have to update its next_leaf pointer
811  * for the new last extent block.
812  *
813  * The new branch will be 'empty' in the sense that every block will
814  * contain a single record with cluster count == 0.
815  */
816 static int ocfs2_add_branch(struct ocfs2_super *osb,
817                             handle_t *handle,
818                             struct inode *inode,
819                             struct ocfs2_extent_tree *et,
820                             struct buffer_head *eb_bh,
821                             struct buffer_head **last_eb_bh,
822                             struct ocfs2_alloc_context *meta_ac)
823 {
824         int status, new_blocks, i;
825         u64 next_blkno, new_last_eb_blk;
826         struct buffer_head *bh;
827         struct buffer_head **new_eb_bhs = NULL;
828         struct ocfs2_extent_block *eb;
829         struct ocfs2_extent_list  *eb_el;
830         struct ocfs2_extent_list  *el;
831         u32 new_cpos;
832
833         mlog_entry_void();
834
835         BUG_ON(!last_eb_bh || !*last_eb_bh);
836
837         if (eb_bh) {
838                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
839                 el = &eb->h_list;
840         } else
841                 el = et->et_root_el;
842
843         /* we never add a branch to a leaf. */
844         BUG_ON(!el->l_tree_depth);
845
846         new_blocks = le16_to_cpu(el->l_tree_depth);
847
848         /* allocate the number of new eb blocks we need */
849         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
850                              GFP_KERNEL);
851         if (!new_eb_bhs) {
852                 status = -ENOMEM;
853                 mlog_errno(status);
854                 goto bail;
855         }
856
857         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
858                                            meta_ac, new_eb_bhs);
859         if (status < 0) {
860                 mlog_errno(status);
861                 goto bail;
862         }
863
864         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
865         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
866
867         /* Note: new_eb_bhs[new_blocks - 1] is the block that will be
868          * linked with the rest of the tree.
869          * Conversely, new_eb_bhs[0] is the new bottommost leaf.
870          *
871          * When we leave the loop, new_last_eb_blk will point to the
872          * newest leaf, and next_blkno will point to the topmost extent
873          * block. */
874         next_blkno = new_last_eb_blk = 0;
875         for(i = 0; i < new_blocks; i++) {
876                 bh = new_eb_bhs[i];
877                 eb = (struct ocfs2_extent_block *) bh->b_data;
878                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
879                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
880                         status = -EIO;
881                         goto bail;
882                 }
883                 eb_el = &eb->h_list;
884
885                 status = ocfs2_journal_access(handle, inode, bh,
886                                               OCFS2_JOURNAL_ACCESS_CREATE);
887                 if (status < 0) {
888                         mlog_errno(status);
889                         goto bail;
890                 }
891
892                 eb->h_next_leaf_blk = 0;
893                 eb_el->l_tree_depth = cpu_to_le16(i);
894                 eb_el->l_next_free_rec = cpu_to_le16(1);
895                 /*
896                  * This actually counts as an empty extent as
897                  * c_clusters == 0
898                  */
899                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
900                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
901                 /*
902                  * eb_el isn't always an interior node, but even leaf
903                  * nodes want a zero'd flags and reserved field so
904                  * nodes want zeroed flags and reserved fields, so
905                  */
906                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
907                 if (!eb_el->l_tree_depth)
908                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
909
910                 status = ocfs2_journal_dirty(handle, bh);
911                 if (status < 0) {
912                         mlog_errno(status);
913                         goto bail;
914                 }
915
916                 next_blkno = le64_to_cpu(eb->h_blkno);
917         }
918
919         /* This is a bit hairy. We want to update up to three blocks
920          * here without leaving any of them in an inconsistent state
921          * in case of error. We don't have to worry about
922          * journal_dirty erroring as it won't unless we've aborted the
923          * handle (in which case we would never be here) so reserving
924          * the write with journal_access is all we need to do. */
925         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
926                                       OCFS2_JOURNAL_ACCESS_WRITE);
927         if (status < 0) {
928                 mlog_errno(status);
929                 goto bail;
930         }
931         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
932                                       OCFS2_JOURNAL_ACCESS_WRITE);
933         if (status < 0) {
934                 mlog_errno(status);
935                 goto bail;
936         }
937         if (eb_bh) {
938                 status = ocfs2_journal_access(handle, inode, eb_bh,
939                                               OCFS2_JOURNAL_ACCESS_WRITE);
940                 if (status < 0) {
941                         mlog_errno(status);
942                         goto bail;
943                 }
944         }
945
946         /* Link the new branch into the rest of the tree (el will
947          * either be on the root_bh, or the extent block passed in). */
948         i = le16_to_cpu(el->l_next_free_rec);
949         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
950         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
951         el->l_recs[i].e_int_clusters = 0;
952         le16_add_cpu(&el->l_next_free_rec, 1);
953
954         /* fe needs a new last extent block pointer, as does the
955          * next_leaf on the previously last-extent-block. */
956         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
957
958         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
959         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
960
961         status = ocfs2_journal_dirty(handle, *last_eb_bh);
962         if (status < 0)
963                 mlog_errno(status);
964         status = ocfs2_journal_dirty(handle, et->et_root_bh);
965         if (status < 0)
966                 mlog_errno(status);
967         if (eb_bh) {
968                 status = ocfs2_journal_dirty(handle, eb_bh);
969                 if (status < 0)
970                         mlog_errno(status);
971         }
972
973         /*
974          * Some callers want to track the rightmost leaf so pass it
975          * back here.
976          */
977         brelse(*last_eb_bh);
978         get_bh(new_eb_bhs[0]);
979         *last_eb_bh = new_eb_bhs[0];
980
981         status = 0;
982 bail:
983         if (new_eb_bhs) {
984                 for (i = 0; i < new_blocks; i++)
985                         if (new_eb_bhs[i])
986                                 brelse(new_eb_bhs[i]);
987                 kfree(new_eb_bhs);
988         }
989
990         mlog_exit(status);
991         return status;
992 }
993
994 /*
995  * Adds another level to the allocation tree.
996  * Returns the new extent block so you can add a branch to it
997  * after this call.
998  */
999 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1000                                   handle_t *handle,
1001                                   struct inode *inode,
1002                                   struct ocfs2_extent_tree *et,
1003                                   struct ocfs2_alloc_context *meta_ac,
1004                                   struct buffer_head **ret_new_eb_bh)
1005 {
1006         int status, i;
1007         u32 new_clusters;
1008         struct buffer_head *new_eb_bh = NULL;
1009         struct ocfs2_extent_block *eb;
1010         struct ocfs2_extent_list  *root_el;
1011         struct ocfs2_extent_list  *eb_el;
1012
1013         mlog_entry_void();
1014
1015         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1016                                            &new_eb_bh);
1017         if (status < 0) {
1018                 mlog_errno(status);
1019                 goto bail;
1020         }
1021
1022         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1023         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1024                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1025                 status = -EIO;
1026                 goto bail;
1027         }
1028
1029         eb_el = &eb->h_list;
1030         root_el = et->et_root_el;
1031
1032         status = ocfs2_journal_access(handle, inode, new_eb_bh,
1033                                       OCFS2_JOURNAL_ACCESS_CREATE);
1034         if (status < 0) {
1035                 mlog_errno(status);
1036                 goto bail;
1037         }
1038
1039         /* copy the root extent list data into the new extent block */
1040         eb_el->l_tree_depth = root_el->l_tree_depth;
1041         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1042         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1043                 eb_el->l_recs[i] = root_el->l_recs[i];
1044
1045         status = ocfs2_journal_dirty(handle, new_eb_bh);
1046         if (status < 0) {
1047                 mlog_errno(status);
1048                 goto bail;
1049         }
1050
1051         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1052                                       OCFS2_JOURNAL_ACCESS_WRITE);
1053         if (status < 0) {
1054                 mlog_errno(status);
1055                 goto bail;
1056         }
1057
1058         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1059
1060         /* update root_bh now */
1061         le16_add_cpu(&root_el->l_tree_depth, 1);
1062         root_el->l_recs[0].e_cpos = 0;
1063         root_el->l_recs[0].e_blkno = eb->h_blkno;
1064         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1065         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1066                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1067         root_el->l_next_free_rec = cpu_to_le16(1);
1068
1069         /* If this is our 1st tree depth shift, then last_eb_blk
1070          * becomes the allocated extent block */
1071         if (root_el->l_tree_depth == cpu_to_le16(1))
1072                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1073
1074         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1075         if (status < 0) {
1076                 mlog_errno(status);
1077                 goto bail;
1078         }
1079
1080         *ret_new_eb_bh = new_eb_bh;
1081         new_eb_bh = NULL;
1082         status = 0;
1083 bail:
1084         if (new_eb_bh)
1085                 brelse(new_eb_bh);
1086
1087         mlog_exit(status);
1088         return status;
1089 }
1090
1091 /*
1092  * Should only be called when there is no space left in any of the
1093  * leaf nodes. What we want to do is find the lowest tree depth
1094  * non-leaf extent block with room for new records. There are three
1095  * valid results of this search:
1096  *
1097  * 1) a lowest extent block is found, then we pass it back in
1098  *    *target_bh and return '0'
1099  *
1100  * 2) the search fails to find anything, but the root_el has room. We
1101  *    pass NULL back in *target_bh, but still return '0'
1102  *
1103  * 3) the search fails to find anything AND the root_el is full, in
1104  *    which case we return > 0
1105  *
1106  * return status < 0 indicates an error.
1107  */
1108 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1109                                     struct inode *inode,
1110                                     struct ocfs2_extent_tree *et,
1111                                     struct buffer_head **target_bh)
1112 {
1113         int status = 0, i;
1114         u64 blkno;
1115         struct ocfs2_extent_block *eb;
1116         struct ocfs2_extent_list  *el;
1117         struct buffer_head *bh = NULL;
1118         struct buffer_head *lowest_bh = NULL;
1119
1120         mlog_entry_void();
1121
1122         *target_bh = NULL;
1123
1124         el = et->et_root_el;
1125
1126         while(le16_to_cpu(el->l_tree_depth) > 1) {
1127                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1128                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1129                                     "extent list (next_free_rec == 0)",
1130                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1131                         status = -EIO;
1132                         goto bail;
1133                 }
1134                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1135                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1136                 if (!blkno) {
1137                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1138                                     "list where extent # %d has no physical "
1139                                     "block start",
1140                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1141                         status = -EIO;
1142                         goto bail;
1143                 }
1144
1145                 if (bh) {
1146                         brelse(bh);
1147                         bh = NULL;
1148                 }
1149
1150                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
1151                                           inode);
1152                 if (status < 0) {
1153                         mlog_errno(status);
1154                         goto bail;
1155                 }
1156
1157                 eb = (struct ocfs2_extent_block *) bh->b_data;
1158                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1159                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1160                         status = -EIO;
1161                         goto bail;
1162                 }
1163                 el = &eb->h_list;
1164
1165                 if (le16_to_cpu(el->l_next_free_rec) <
1166                     le16_to_cpu(el->l_count)) {
1167                         if (lowest_bh)
1168                                 brelse(lowest_bh);
1169                         lowest_bh = bh;
1170                         get_bh(lowest_bh);
1171                 }
1172         }
1173
1174         /* If we didn't find one and the fe doesn't have any room,
1175          * then return '1' */
1176         el = et->et_root_el;
1177         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1178                 status = 1;
1179
1180         *target_bh = lowest_bh;
1181 bail:
1182         if (bh)
1183                 brelse(bh);
1184
1185         mlog_exit(status);
1186         return status;
1187 }
1188
1189 /*
1190  * Grow a b-tree so that it has more records.
1191  *
1192  * We might shift the tree depth in which case existing paths should
1193  * be considered invalid.
1194  *
1195  * Tree depth after the grow is returned via *final_depth.
1196  *
1197  * *last_eb_bh will be updated by ocfs2_add_branch().
1198  */
1199 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1200                            struct ocfs2_extent_tree *et, int *final_depth,
1201                            struct buffer_head **last_eb_bh,
1202                            struct ocfs2_alloc_context *meta_ac)
1203 {
1204         int ret, shift;
1205         struct ocfs2_extent_list *el = et->et_root_el;
1206         int depth = le16_to_cpu(el->l_tree_depth);
1207         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1208         struct buffer_head *bh = NULL;
1209
1210         BUG_ON(meta_ac == NULL);
1211
1212         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1213         if (shift < 0) {
1214                 ret = shift;
1215                 mlog_errno(ret);
1216                 goto out;
1217         }
1218
1219         /* We traveled all the way to the bottom of the allocation tree
1220          * and didn't find room for any more extents - we need to add
1221          * another tree level */
1222         if (shift) {
1223                 BUG_ON(bh);
1224                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1225
1226                 /* ocfs2_shift_tree_depth will return us a buffer with
1227                  * the new extent block (so we can pass that to
1228                  * ocfs2_add_branch). */
1229                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1230                                              meta_ac, &bh);
1231                 if (ret < 0) {
1232                         mlog_errno(ret);
1233                         goto out;
1234                 }
1235                 depth++;
1236                 if (depth == 1) {
1237                         /*
1238                          * Special case: we have room now if we shifted from
1239                          * tree_depth 0, so no more work needs to be done.
1240                          *
1241                          * We won't be calling add_branch, so pass
1242                          * back *last_eb_bh as the new leaf. At depth
1243                          * zero, it should always be null so there's
1244                          * no reason to brelse.
1245                          */
1246                         BUG_ON(*last_eb_bh);
1247                         get_bh(bh);
1248                         *last_eb_bh = bh;
1249                         goto out;
1250                 }
1251         }
1252
1253         /* call ocfs2_add_branch to add the final part of the tree with
1254          * the new data. */
1255         mlog(0, "add branch. bh = %p\n", bh);
1256         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1257                                meta_ac);
1258         if (ret < 0) {
1259                 mlog_errno(ret);
1260                 goto out;
1261         }
1262
1263 out:
1264         if (final_depth)
1265                 *final_depth = depth;
1266         brelse(bh);
1267         return ret;
1268 }
1269
1270 /*
1271  * Shift all records in el one slot right; slot 0 keeps a stale copy for the caller to overwrite.
1272  */
1273 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1274 {
1275         int next_free = le16_to_cpu(el->l_next_free_rec);
1276         int count = le16_to_cpu(el->l_count);
1277         unsigned int num_bytes;
1278
1279         BUG_ON(!next_free);
1280         /* This will cause us to go off the end of our extent list. */
1281         BUG_ON(next_free >= count);
1282
1283         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1284
1285         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1286 }
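
/*
 * For example, with l_next_free_rec == 2 and records [A, B], the list
 * becomes [A, A, B]: everything moves one slot right and slot 0 keeps
 * a stale copy of A.  l_next_free_rec itself is not touched here.
 */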
1287
1288 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1289                               struct ocfs2_extent_rec *insert_rec)
1290 {
1291         int i, insert_index, next_free, has_empty, num_bytes;
1292         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1293         struct ocfs2_extent_rec *rec;
1294
1295         next_free = le16_to_cpu(el->l_next_free_rec);
1296         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1297
1298         BUG_ON(!next_free);
1299
1300         /* The tree code before us didn't allow enough room in the leaf. */
1301         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1302
1303         /*
1304          * The easiest way to approach this is to just remove the
1305          * empty extent and temporarily decrement next_free.
1306          */
1307         if (has_empty) {
1308                 /*
1309                  * If next_free was 1 (only an empty extent), this
1310                  * loop won't execute, which is fine. We still want
1311                  * the decrement above to happen.
1312                  */
1313                 for(i = 0; i < (next_free - 1); i++)
1314                         el->l_recs[i] = el->l_recs[i+1];
1315
1316                 next_free--;
1317         }
1318
1319         /*
1320          * Figure out what the new record index should be.
1321          */
1322         for(i = 0; i < next_free; i++) {
1323                 rec = &el->l_recs[i];
1324
1325                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1326                         break;
1327         }
1328         insert_index = i;
1329
1330         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1331              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1332
1333         BUG_ON(insert_index < 0);
1334         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1335         BUG_ON(insert_index > next_free);
1336
1337         /*
1338          * No need to memmove if we're just adding to the tail.
1339          */
1340         if (insert_index != next_free) {
1341                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1342
1343                 num_bytes = next_free - insert_index;
1344                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1345                 memmove(&el->l_recs[insert_index + 1],
1346                         &el->l_recs[insert_index],
1347                         num_bytes);
1348         }
1349
1350         /*
1351          * Either we had an empty extent and need to re-increment, or
1352          * there was no empty extent on a non-full rightmost leaf node,
1353          * in which case we still need to increment.
1354          */
1355         next_free++;
1356         el->l_next_free_rec = cpu_to_le16(next_free);
1357         /*
1358          * Make sure none of the math above just messed up our tree.
1359          */
1360         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1361
1362         el->l_recs[insert_index] = *insert_rec;
1363
1364 }
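
/*
 * For example, with records [empty, A(cpos 0, 4 clusters), B(cpos 8, 4)]
 * and an insert record at cpos 4, the empty slot is consumed, the new
 * record lands between A and B, and the leaf ends up as [A, new, B]
 * with l_next_free_rec unchanged at 3.
 */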
1365
1366 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1367 {
1368         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1369
1370         BUG_ON(num_recs == 0);
1371
1372         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1373                 num_recs--;
1374                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1375                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1376                 memset(&el->l_recs[num_recs], 0,
1377                        sizeof(struct ocfs2_extent_rec));
1378                 el->l_next_free_rec = cpu_to_le16(num_recs);
1379         }
1380 }
1381
1382 /*
1383  * Create an empty extent record.
1384  *
1385  * l_next_free_rec may be updated.
1386  *
1387  * If an empty extent already exists do nothing.
1388  */
1389 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1390 {
1391         int next_free = le16_to_cpu(el->l_next_free_rec);
1392
1393         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1394
1395         if (next_free == 0)
1396                 goto set_and_inc;
1397
1398         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1399                 return;
1400
1401         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1402                         "Asked to create an empty extent in a full list:\n"
1403                         "count = %u, tree depth = %u",
1404                         le16_to_cpu(el->l_count),
1405                         le16_to_cpu(el->l_tree_depth));
1406
1407         ocfs2_shift_records_right(el);
1408
1409 set_and_inc:
1410         le16_add_cpu(&el->l_next_free_rec, 1);
1411         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1412 }
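
/*
 * For example, a leaf holding [A(cpos 0), B(cpos 4)] with no empty
 * extent becomes [empty, A, B] and l_next_free_rec goes from 2 to 3.
 */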
1413
1414 /*
1415  * For a rotation which involves two leaf nodes, the "root node" is
1416  * the lowest level tree node which contains a path to both leaves. This
1417  * resulting set of information can be used to form a complete "subtree".
1418  *
1419  * This function is passed two full paths from the dinode down to a
1420  * pair of adjacent leaves. Its task is to figure out which path
1421  * index contains the subtree root - this can be the root index itself
1422  * in a worst-case rotation.
1423  *
1424  * The array index of the subtree root is passed back.
1425  */
1426 static int ocfs2_find_subtree_root(struct inode *inode,
1427                                    struct ocfs2_path *left,
1428                                    struct ocfs2_path *right)
1429 {
1430         int i = 0;
1431
1432         /*
1433          * Check that the caller passed in two paths from the same tree.
1434          */
1435         BUG_ON(path_root_bh(left) != path_root_bh(right));
1436
1437         do {
1438                 i++;
1439
1440                 /*
1441                  * The caller didn't pass two adjacent paths.
1442                  */
1443                 mlog_bug_on_msg(i > left->p_tree_depth,
1444                                 "Inode %lu, left depth %u, right depth %u\n"
1445                                 "left leaf blk %llu, right leaf blk %llu\n",
1446                                 inode->i_ino, left->p_tree_depth,
1447                                 right->p_tree_depth,
1448                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1449                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1450         } while (left->p_node[i].bh->b_blocknr ==
1451                  right->p_node[i].bh->b_blocknr);
1452
1453         return i - 1;
1454 }
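
/*
 * For example, if two depth-3 paths share the nodes at indices 0 and 1
 * but diverge at index 2, the loop above exits with i == 2 and the
 * subtree root returned is index 1.
 */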
1455
1456 typedef void (path_insert_t)(void *, struct buffer_head *);
1457
1458 /*
1459  * Traverse a btree path in search of cpos, starting at root_el.
1460  *
1461  * This code can be called with a cpos larger than the tree, in which
1462  * case it will return the rightmost path.
1463  */
1464 static int __ocfs2_find_path(struct inode *inode,
1465                              struct ocfs2_extent_list *root_el, u32 cpos,
1466                              path_insert_t *func, void *data)
1467 {
1468         int i, ret = 0;
1469         u32 range;
1470         u64 blkno;
1471         struct buffer_head *bh = NULL;
1472         struct ocfs2_extent_block *eb;
1473         struct ocfs2_extent_list *el;
1474         struct ocfs2_extent_rec *rec;
1475         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1476
1477         el = root_el;
1478         while (el->l_tree_depth) {
1479                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1480                         ocfs2_error(inode->i_sb,
1481                                     "Inode %llu has empty extent list at "
1482                                     "depth %u\n",
1483                                     (unsigned long long)oi->ip_blkno,
1484                                     le16_to_cpu(el->l_tree_depth));
1485                         ret = -EROFS;
1486                         goto out;
1487
1488                 }
1489
1490                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1491                         rec = &el->l_recs[i];
1492
1493                         /*
1494                          * In the case that cpos is off the allocation
1495                          * tree, this should just wind up returning the
1496                          * rightmost record.
1497                          */
1498                         range = le32_to_cpu(rec->e_cpos) +
1499                                 ocfs2_rec_clusters(el, rec);
1500                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1501                             break;
1502                 }
1503
1504                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1505                 if (blkno == 0) {
1506                         ocfs2_error(inode->i_sb,
1507                                     "Inode %llu has bad blkno in extent list "
1508                                     "at depth %u (index %d)\n",
1509                                     (unsigned long long)oi->ip_blkno,
1510                                     le16_to_cpu(el->l_tree_depth), i);
1511                         ret = -EROFS;
1512                         goto out;
1513                 }
1514
1515                 brelse(bh);
1516                 bh = NULL;
1517                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1518                                        &bh, OCFS2_BH_CACHED, inode);
1519                 if (ret) {
1520                         mlog_errno(ret);
1521                         goto out;
1522                 }
1523
1524                 eb = (struct ocfs2_extent_block *) bh->b_data;
1525                 el = &eb->h_list;
1526                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1527                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1528                         ret = -EIO;
1529                         goto out;
1530                 }
1531
1532                 if (le16_to_cpu(el->l_next_free_rec) >
1533                     le16_to_cpu(el->l_count)) {
1534                         ocfs2_error(inode->i_sb,
1535                                     "Inode %llu has bad count in extent list "
1536                                     "at block %llu (next free=%u, count=%u)\n",
1537                                     (unsigned long long)oi->ip_blkno,
1538                                     (unsigned long long)bh->b_blocknr,
1539                                     le16_to_cpu(el->l_next_free_rec),
1540                                     le16_to_cpu(el->l_count));
1541                         ret = -EROFS;
1542                         goto out;
1543                 }
1544
1545                 if (func)
1546                         func(data, bh);
1547         }
1548
1549 out:
1550         /*
1551          * Catch any trailing bh that the loop didn't handle.
1552          */
1553         brelse(bh);
1554
1555         return ret;
1556 }
1557
1558 /*
1559  * Given an initialized path (that is, it has a valid root extent
1560  * list), this function will traverse the btree in search of the path
1561  * which would contain cpos.
1562  *
1563  * The path traveled is recorded in the path structure.
1564  *
1565  * Note that this will not do any comparisons on leaf node extent
1566  * records, so it will work fine in the case that we just added a tree
1567  * branch.
1568  */
1569 struct find_path_data {
1570         int index;
1571         struct ocfs2_path *path;
1572 };
1573 static void find_path_ins(void *data, struct buffer_head *bh)
1574 {
1575         struct find_path_data *fp = data;
1576
1577         get_bh(bh);
1578         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1579         fp->index++;
1580 }
1581 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1582                            u32 cpos)
1583 {
1584         struct find_path_data data;
1585
1586         data.index = 1;
1587         data.path = path;
1588         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1589                                  find_path_ins, &data);
1590 }
1591
1592 static void find_leaf_ins(void *data, struct buffer_head *bh)
1593 {
1594         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1595         struct ocfs2_extent_list *el = &eb->h_list;
1596         struct buffer_head **ret = data;
1597
1598         /* We want to retain only the leaf block. */
1599         if (le16_to_cpu(el->l_tree_depth) == 0) {
1600                 get_bh(bh);
1601                 *ret = bh;
1602         }
1603 }
1604 /*
1605  * Find the leaf block in the tree which would contain cpos. No
1606  * checking of the actual leaf is done.
1607  *
1608  * Some callers want to use this instead of allocating a path structure
1609  * and calling ocfs2_find_path().
1610  *
1611  * This function doesn't handle non-btree extent lists.
1612  */
1613 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1614                     u32 cpos, struct buffer_head **leaf_bh)
1615 {
1616         int ret;
1617         struct buffer_head *bh = NULL;
1618
1619         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1620         if (ret) {
1621                 mlog_errno(ret);
1622                 goto out;
1623         }
1624
1625         *leaf_bh = bh;
1626 out:
1627         return ret;
1628 }
1629
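/*
 * A minimal caller sketch for ocfs2_find_leaf() (illustrative only;
 * 'inode', 'root_el' and 'cpos' are assumed to come from the caller):
 *
 *        struct buffer_head *leaf_bh = NULL;
 *        int ret;
 *
 *        ret = ocfs2_find_leaf(inode, root_el, cpos, &leaf_bh);
 *        if (!ret) {
 *                struct ocfs2_extent_block *eb =
 *                        (struct ocfs2_extent_block *)leaf_bh->b_data;
 *
 *                ... inspect eb->h_list ...
 *                brelse(leaf_bh);
 *        }
 *
 * find_leaf_ins() takes an extra reference on the leaf buffer, so the
 * caller is responsible for the final brelse().
 */
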
1630 /*
1631  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1632  *
1633  * Basically, we've moved stuff around at the bottom of the tree and
1634  * we need to fix up the extent records above the changes so that they
1635  * reflect the new layout.
1636  *
1637  * left_rec: the record on the left.
1638  * left_child_el: the child list pointed to by left_rec
1639  * right_rec: the record to the right of left_rec
1640  * right_child_el: the child list pointed to by right_rec
1641  *
1642  * By definition, this only works on interior nodes.
1643  */
1644 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1645                                   struct ocfs2_extent_list *left_child_el,
1646                                   struct ocfs2_extent_rec *right_rec,
1647                                   struct ocfs2_extent_list *right_child_el)
1648 {
1649         u32 left_clusters, right_end;
1650
1651         /*
1652          * Interior nodes never have holes. Their cpos is the cpos of
1653          * the leftmost record in their child list. Their cluster
1654          * count covers the full theoretical range of their child list
1655          * - the range between their cpos and the cpos of the record
1656          * immediately to their right.
1657          */
1658         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1659         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1660                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1661                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1662         }
1663         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1664         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1665
1666         /*
1667          * Calculate the rightmost cluster count boundary before
1668          * moving cpos - we will need to adjust clusters after
1669          * updating e_cpos to keep the same highest cluster count.
1670          */
1671         right_end = le32_to_cpu(right_rec->e_cpos);
1672         right_end += le32_to_cpu(right_rec->e_int_clusters);
1673
1674         right_rec->e_cpos = left_rec->e_cpos;
1675         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1676
1677         right_end -= le32_to_cpu(right_rec->e_cpos);
1678         right_rec->e_int_clusters = cpu_to_le32(right_end);
1679 }
1680
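/*
 * Worked example for the adjustment above (hypothetical values): say
 * left_rec->e_cpos is 100 and the first live record in right_child_el
 * starts at cpos 130, so left_clusters becomes 30 and
 * left_rec->e_int_clusters is set to 30. If right_rec previously was
 * (e_cpos 150, e_int_clusters 50), then right_end is 200, the new
 * right_rec->e_cpos is 100 + 30 = 130 and the new e_int_clusters is
 * 200 - 130 = 70 - the right record still ends at cluster 200.
 */
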
1681 /*
1682  * Adjust the adjacent root node records involved in a
1683  * rotation. left_el_blkno is passed in as a key so that we can easily
1684  * find its index in the root list.
1685  */
1686 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1687                                       struct ocfs2_extent_list *left_el,
1688                                       struct ocfs2_extent_list *right_el,
1689                                       u64 left_el_blkno)
1690 {
1691         int i;
1692
1693         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1694                le16_to_cpu(left_el->l_tree_depth));
1695
1696         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1697                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1698                         break;
1699         }
1700
1701         /*
1702          * The path walking code should have never returned a root and
1703          * two paths which are not adjacent.
1704          */
1705         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1706
1707         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1708                                       &root_el->l_recs[i + 1], right_el);
1709 }
1710
1711 /*
1712  * We've changed a leaf block (in right_path) and need to reflect that
1713  * change back up the subtree.
1714  *
1715  * This happens in multiple places:
1716  *   - When we've moved an extent record from the left path leaf to the right
1717  *     path leaf to make room for an empty extent in the left path leaf.
1718  *   - When our insert into the right path leaf is at the leftmost edge
1719  *     and requires an update of the path immediately to its left. This
1720  *     can occur at the end of some types of rotation and appending inserts.
1721  *   - When we've adjusted the last extent record in the left path leaf and the
1722  *     1st extent record in the right path leaf during cross extent block merge.
1723  */
1724 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1725                                        struct ocfs2_path *left_path,
1726                                        struct ocfs2_path *right_path,
1727                                        int subtree_index)
1728 {
1729         int ret, i, idx;
1730         struct ocfs2_extent_list *el, *left_el, *right_el;
1731         struct ocfs2_extent_rec *left_rec, *right_rec;
1732         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1733
1734         /*
1735          * Update the counts and position values within all the
1736          * interior nodes to reflect the leaf rotation we just did.
1737          *
1738          * The root node is handled below the loop.
1739          *
1740          * We begin the loop with right_el and left_el pointing to the
1741          * leaf lists and work our way up.
1742          *
1743          * NOTE: within this loop, left_el and right_el always refer
1744          * to the *child* lists.
1745          */
1746         left_el = path_leaf_el(left_path);
1747         right_el = path_leaf_el(right_path);
1748         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1749                 mlog(0, "Adjust records at index %u\n", i);
1750
1751                 /*
1752                  * One nice property of knowing that all of these
1753                  * nodes are below the root is that we only deal with
1754                  * the leftmost right node record and the rightmost
1755                  * left node record.
1756                  */
1757                 el = left_path->p_node[i].el;
1758                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1759                 left_rec = &el->l_recs[idx];
1760
1761                 el = right_path->p_node[i].el;
1762                 right_rec = &el->l_recs[0];
1763
1764                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1765                                               right_el);
1766
1767                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1768                 if (ret)
1769                         mlog_errno(ret);
1770
1771                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1772                 if (ret)
1773                         mlog_errno(ret);
1774
1775                 /*
1776                  * Setup our list pointers now so that the current
1777                  * parents become children in the next iteration.
1778                  */
1779                 left_el = left_path->p_node[i].el;
1780                 right_el = right_path->p_node[i].el;
1781         }
1782
1783         /*
1784          * At the root node, adjust the two adjacent records which
1785          * begin our path to the leaves.
1786          */
1787
1788         el = left_path->p_node[subtree_index].el;
1789         left_el = left_path->p_node[subtree_index + 1].el;
1790         right_el = right_path->p_node[subtree_index + 1].el;
1791
1792         ocfs2_adjust_root_records(el, left_el, right_el,
1793                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1794
1795         root_bh = left_path->p_node[subtree_index].bh;
1796
1797         ret = ocfs2_journal_dirty(handle, root_bh);
1798         if (ret)
1799                 mlog_errno(ret);
1800 }
1801
1802 static int ocfs2_rotate_subtree_right(struct inode *inode,
1803                                       handle_t *handle,
1804                                       struct ocfs2_path *left_path,
1805                                       struct ocfs2_path *right_path,
1806                                       int subtree_index)
1807 {
1808         int ret, i;
1809         struct buffer_head *right_leaf_bh;
1810         struct buffer_head *left_leaf_bh = NULL;
1811         struct buffer_head *root_bh;
1812         struct ocfs2_extent_list *right_el, *left_el;
1813         struct ocfs2_extent_rec move_rec;
1814
1815         left_leaf_bh = path_leaf_bh(left_path);
1816         left_el = path_leaf_el(left_path);
1817
1818         if (left_el->l_next_free_rec != left_el->l_count) {
1819                 ocfs2_error(inode->i_sb,
1820                             "Inode %llu has non-full interior leaf node %llu"
1821                             " (next free = %u)",
1822                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1823                             (unsigned long long)left_leaf_bh->b_blocknr,
1824                             le16_to_cpu(left_el->l_next_free_rec));
1825                 return -EROFS;
1826         }
1827
1828         /*
1829          * This extent block may already have an empty record, so we
1830          * return early if so.
1831          */
1832         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1833                 return 0;
1834
1835         root_bh = left_path->p_node[subtree_index].bh;
1836         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1837
1838         ret = ocfs2_journal_access(handle, inode, root_bh,
1839                                    OCFS2_JOURNAL_ACCESS_WRITE);
1840         if (ret) {
1841                 mlog_errno(ret);
1842                 goto out;
1843         }
1844
1845         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1846                 ret = ocfs2_journal_access(handle, inode,
1847                                            right_path->p_node[i].bh,
1848                                            OCFS2_JOURNAL_ACCESS_WRITE);
1849                 if (ret) {
1850                         mlog_errno(ret);
1851                         goto out;
1852                 }
1853
1854                 ret = ocfs2_journal_access(handle, inode,
1855                                            left_path->p_node[i].bh,
1856                                            OCFS2_JOURNAL_ACCESS_WRITE);
1857                 if (ret) {
1858                         mlog_errno(ret);
1859                         goto out;
1860                 }
1861         }
1862
1863         right_leaf_bh = path_leaf_bh(right_path);
1864         right_el = path_leaf_el(right_path);
1865
1866         /* This is a code error, not a disk corruption. */
1867         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1868                         "because rightmost leaf block %llu is empty\n",
1869                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1870                         (unsigned long long)right_leaf_bh->b_blocknr);
1871
1872         ocfs2_create_empty_extent(right_el);
1873
1874         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1875         if (ret) {
1876                 mlog_errno(ret);
1877                 goto out;
1878         }
1879
1880         /* Do the copy now. */
1881         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1882         move_rec = left_el->l_recs[i];
1883         right_el->l_recs[0] = move_rec;
1884
1885         /*
1886          * Clear out the record we just copied and shift everything
1887          * over, leaving an empty extent in the left leaf.
1888          *
1889          * We temporarily subtract from next_free_rec so that the
1890          * shift will lose the tail record (which is now defunct).
1891          */
1892         le16_add_cpu(&left_el->l_next_free_rec, -1);
1893         ocfs2_shift_records_right(left_el);
1894         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1895         le16_add_cpu(&left_el->l_next_free_rec, 1);
1896
1897         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1898         if (ret) {
1899                 mlog_errno(ret);
1900                 goto out;
1901         }
1902
1903         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1904                                 subtree_index);
1905
1906 out:
1907         return ret;
1908 }
1909
1910 /*
1911  * Given a full path, determine what cpos value would return us a path
1912  * containing the leaf immediately to the left of the current one.
1913  *
1914  * Will return zero if the path passed in is already the leftmost path.
1915  */
1916 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1917                                          struct ocfs2_path *path, u32 *cpos)
1918 {
1919         int i, j, ret = 0;
1920         u64 blkno;
1921         struct ocfs2_extent_list *el;
1922
1923         BUG_ON(path->p_tree_depth == 0);
1924
1925         *cpos = 0;
1926
1927         blkno = path_leaf_bh(path)->b_blocknr;
1928
1929         /* Start at the tree node just above the leaf and work our way up. */
1930         i = path->p_tree_depth - 1;
1931         while (i >= 0) {
1932                 el = path->p_node[i].el;
1933
1934                 /*
1935                  * Find the extent record just before the one in our
1936                  * path.
1937                  */
1938                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1939                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1940                                 if (j == 0) {
1941                                         if (i == 0) {
1942                                                 /*
1943                                                  * We've determined that the
1944                                                  * path specified is already
1945                                                  * the leftmost one - return a
1946                                                  * cpos of zero.
1947                                                  */
1948                                                 goto out;
1949                                         }
1950                                         /*
1951                                          * The leftmost record points to our
1952                                          * leaf - we need to travel up the
1953                                          * tree one level.
1954                                          */
1955                                         goto next_node;
1956                                 }
1957
1958                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1959                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1960                                                            &el->l_recs[j - 1]);
1961                                 *cpos = *cpos - 1;
1962                                 goto out;
1963                         }
1964                 }
1965
1966                 /*
1967                  * If we got here, we never found a valid node where
1968                  * the tree indicated one should be.
1969                  */
1970                 ocfs2_error(sb,
1971                             "Invalid extent tree at extent block %llu\n",
1972                             (unsigned long long)blkno);
1973                 ret = -EROFS;
1974                 goto out;
1975
1976 next_node:
1977                 blkno = path->p_node[i].bh->b_blocknr;
1978                 i--;
1979         }
1980
1981 out:
1982         return ret;
1983 }
1984
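/*
 * Example of the value computed above (hypothetical record): if the
 * record immediately to the left of our path covers (e_cpos 200,
 * 16 clusters), then *cpos is set to 200 + 16 - 1 = 215, the last
 * cluster governed by the left neighbor - looking that cpos up again
 * yields the left leaf.
 */
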
1985 /*
1986  * Extend the transaction by enough credits to complete the rotation,
1987  * and still leave at least the original number of credits allocated
1988  * to this transaction.
1989  */
1990 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1991                                            int op_credits,
1992                                            struct ocfs2_path *path)
1993 {
1994         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1995
1996         if (handle->h_buffer_credits < credits)
1997                 return ocfs2_extend_trans(handle, credits);
1998
1999         return 0;
2000 }
2001
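/*
 * Example of the credit calculation above (hypothetical numbers): for
 * a path of depth 3, a subtree root at depth 1 and op_credits of 1,
 * credits = (3 - 1) * 2 + 1 + 1 = 6. The transaction is only extended
 * when handle->h_buffer_credits has already fallen below that value.
 */
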
2002 /*
2003  * Trap the case where we're inserting into the theoretical range past
2004  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2005  * whose cpos is less than ours into the right leaf.
2006  *
2007  * It's only necessary to look at the rightmost record of the left
2008  * leaf because the logic that calls us should ensure that the
2009  * theoretical ranges in the path components above the leaves are
2010  * correct.
2011  */
2012 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2013                                                  u32 insert_cpos)
2014 {
2015         struct ocfs2_extent_list *left_el;
2016         struct ocfs2_extent_rec *rec;
2017         int next_free;
2018
2019         left_el = path_leaf_el(left_path);
2020         next_free = le16_to_cpu(left_el->l_next_free_rec);
2021         rec = &left_el->l_recs[next_free - 1];
2022
2023         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2024                 return 1;
2025         return 0;
2026 }
2027
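/*
 * Example for the check above (hypothetical record): if the rightmost
 * record in the left leaf starts at e_cpos 80, the function returns 1
 * for any insert_cpos strictly greater than 80 (for instance 81) and
 * 0 for an insert_cpos of 80 or less, in which case no post-insert
 * adjustment of the left path is required.
 */
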
2028 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2029 {
2030         int next_free = le16_to_cpu(el->l_next_free_rec);
2031         unsigned int range;
2032         struct ocfs2_extent_rec *rec;
2033
2034         if (next_free == 0)
2035                 return 0;
2036
2037         rec = &el->l_recs[0];
2038         if (ocfs2_is_empty_extent(rec)) {
2039                 /* List holds only the empty extent. */
2040                 if (next_free == 1)
2041                         return 0;
2042                 rec = &el->l_recs[1];
2043         }
2044
2045         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2046         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2047                 return 1;
2048         return 0;
2049 }
2050
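/*
 * Example for the helper above (hypothetical leaf): with an empty
 * record in slot 0 and a live record (e_cpos 40, 8 clusters) in slot
 * 1, the function returns 1 for cpos values 40 through 47 and 0 for
 * anything else - only the leftmost live record is ever consulted.
 */
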
2051 /*
2052  * Rotate all the records in a btree right one record, starting at insert_cpos.
2053  *
2054  * The path to the rightmost leaf should be passed in.
2055  *
2056  * The array is assumed to be large enough to hold an entire path (tree depth).
2057  *
2058  * Upon successful return from this function:
2059  *
2060  * - The 'right_path' array will contain a path to the leaf block
2061  *   whose range contains insert_cpos.
2062  * - That leaf block will have a single empty extent in list index 0.
2063  * - In the case that the rotation requires a post-insert update,
2064  *   *ret_left_path will contain a valid path which can be passed to
2065  *   ocfs2_insert_path().
2066  */
2067 static int ocfs2_rotate_tree_right(struct inode *inode,
2068                                    handle_t *handle,
2069                                    enum ocfs2_split_type split,
2070                                    u32 insert_cpos,
2071                                    struct ocfs2_path *right_path,
2072                                    struct ocfs2_path **ret_left_path)
2073 {
2074         int ret, start, orig_credits = handle->h_buffer_credits;
2075         u32 cpos;
2076         struct ocfs2_path *left_path = NULL;
2077
2078         *ret_left_path = NULL;
2079
2080         left_path = ocfs2_new_path(path_root_bh(right_path),
2081                                    path_root_el(right_path));
2082         if (!left_path) {
2083                 ret = -ENOMEM;
2084                 mlog_errno(ret);
2085                 goto out;
2086         }
2087
2088         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2089         if (ret) {
2090                 mlog_errno(ret);
2091                 goto out;
2092         }
2093
2094         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2095
2096         /*
2097          * What we want to do here is:
2098          *
2099          * 1) Start with the rightmost path.
2100          *
2101          * 2) Determine a path to the leaf block directly to the left
2102          *    of that leaf.
2103          *
2104          * 3) Determine the 'subtree root' - the lowest level tree node
2105          *    which contains a path to both leaves.
2106          *
2107          * 4) Rotate the subtree.
2108          *
2109          * 5) Find the next subtree by considering the left path to be
2110          *    the new right path.
2111          *
2112          * The check at the top of this while loop also accepts
2113          * insert_cpos == cpos because cpos is only a _theoretical_
2114          * value to get us the left path - insert_cpos might very well
2115          * be filling that hole.
2116          *
2117          * Stop at a cpos of '0' because we either started at the
2118          * leftmost branch (i.e., a tree with one branch and a
2119          * rotation inside of it), or we've gone as far as we can in
2120          * rotating subtrees.
2121          */
2122         while (cpos && insert_cpos <= cpos) {
2123                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2124                      insert_cpos, cpos);
2125
2126                 ret = ocfs2_find_path(inode, left_path, cpos);
2127                 if (ret) {
2128                         mlog_errno(ret);
2129                         goto out;
2130                 }
2131
2132                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2133                                 path_leaf_bh(right_path),
2134                                 "Inode %lu: error during insert of %u "
2135                                 "(left path cpos %u) results in two identical "
2136                                 "paths ending at %llu\n",
2137                                 inode->i_ino, insert_cpos, cpos,
2138                                 (unsigned long long)
2139                                 path_leaf_bh(left_path)->b_blocknr);
2140
2141                 if (split == SPLIT_NONE &&
2142                     ocfs2_rotate_requires_path_adjustment(left_path,
2143                                                           insert_cpos)) {
2144
2145                         /*
2146                          * We've rotated the tree as much as we
2147                          * should. The rest is up to
2148                          * ocfs2_insert_path() to complete, after the
2149                          * record insertion. We indicate this
2150                          * situation by returning the left path.
2151                          *
2152                          * The reason we don't adjust the records here
2153                          * before the record insert is that an error
2154                          * later might break the rule where a parent
2155                          * record e_cpos will reflect the actual
2156                          * e_cpos of the 1st nonempty record of the
2157                          * child list.
2158                          */
2159                         *ret_left_path = left_path;
2160                         goto out_ret_path;
2161                 }
2162
2163                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2164
2165                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2166                      start,
2167                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2168                      right_path->p_tree_depth);
2169
2170                 ret = ocfs2_extend_rotate_transaction(handle, start,
2171                                                       orig_credits, right_path);
2172                 if (ret) {
2173                         mlog_errno(ret);
2174                         goto out;
2175                 }
2176
2177                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2178                                                  right_path, start);
2179                 if (ret) {
2180                         mlog_errno(ret);
2181                         goto out;
2182                 }
2183
2184                 if (split != SPLIT_NONE &&
2185                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2186                                                 insert_cpos)) {
2187                         /*
2188                          * A rotate moves the rightmost left leaf
2189                          * record over to the leftmost right leaf
2190                          * slot. If we're doing an extent split
2191                          * instead of a real insert, then we have to
2192                          * check that the extent to be split wasn't
2193                          * just moved over. If it was, then we can
2194                          * exit here, passing left_path back -
2195                          * ocfs2_split_extent() is smart enough to
2196                          * search both leaves.
2197                          */
2198                         *ret_left_path = left_path;
2199                         goto out_ret_path;
2200                 }
2201
2202                 /*
2203                  * There is no need to re-read the next right path
2204                  * as we know that it'll be our current left
2205                  * path. Optimize by copying values instead.
2206                  */
2207                 ocfs2_mv_path(right_path, left_path);
2208
2209                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2210                                                     &cpos);
2211                 if (ret) {
2212                         mlog_errno(ret);
2213                         goto out;
2214                 }
2215         }
2216
2217 out:
2218         ocfs2_free_path(left_path);
2219
2220 out_ret_path:
2221         return ret;
2222 }
2223
2224 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2225                                       struct ocfs2_path *path)
2226 {
2227         int i, idx;
2228         struct ocfs2_extent_rec *rec;
2229         struct ocfs2_extent_list *el;
2230         struct ocfs2_extent_block *eb;
2231         u32 range;
2232
2233         /* Path should always be rightmost. */
2234         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2235         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2236
2237         el = &eb->h_list;
2238         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2239         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2240         rec = &el->l_recs[idx];
2241         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2242
2243         for (i = 0; i < path->p_tree_depth; i++) {
2244                 el = path->p_node[i].el;
2245                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2246                 rec = &el->l_recs[idx];
2247
2248                 rec->e_int_clusters = cpu_to_le32(range);
2249                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2250
2251                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2252         }
2253 }
2254
2255 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2256                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2257                               struct ocfs2_path *path, int unlink_start)
2258 {
2259         int ret, i;
2260         struct ocfs2_extent_block *eb;
2261         struct ocfs2_extent_list *el;
2262         struct buffer_head *bh;
2263
2264         for(i = unlink_start; i < path_num_items(path); i++) {
2265                 bh = path->p_node[i].bh;
2266
2267                 eb = (struct ocfs2_extent_block *)bh->b_data;
2268                 /*
2269                  * Not all nodes might have had their final count
2270                  * decremented by the caller - handle this here.
2271                  */
2272                 el = &eb->h_list;
2273                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2274                         mlog(ML_ERROR,
2275                              "Inode %llu, attempted to remove extent block "
2276                              "%llu with %u records\n",
2277                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2278                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2279                              le16_to_cpu(el->l_next_free_rec));
2280
2281                         ocfs2_journal_dirty(handle, bh);
2282                         ocfs2_remove_from_cache(inode, bh);
2283                         continue;
2284                 }
2285
2286                 el->l_next_free_rec = 0;
2287                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2288
2289                 ocfs2_journal_dirty(handle, bh);
2290
2291                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2292                 if (ret)
2293                         mlog_errno(ret);
2294
2295                 ocfs2_remove_from_cache(inode, bh);
2296         }
2297 }
2298
2299 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2300                                  struct ocfs2_path *left_path,
2301                                  struct ocfs2_path *right_path,
2302                                  int subtree_index,
2303                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2304 {
2305         int i;
2306         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2307         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2308         struct ocfs2_extent_list *el;
2309         struct ocfs2_extent_block *eb;
2310
2311         el = path_leaf_el(left_path);
2312
2313         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2314
2315         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2316                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2317                         break;
2318
2319         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2320
2321         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2322         le16_add_cpu(&root_el->l_next_free_rec, -1);
2323
2324         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2325         eb->h_next_leaf_blk = 0;
2326
2327         ocfs2_journal_dirty(handle, root_bh);
2328         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2329
2330         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2331                           subtree_index + 1);
2332 }
2333
2334 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2335                                      struct ocfs2_path *left_path,
2336                                      struct ocfs2_path *right_path,
2337                                      int subtree_index,
2338                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2339                                      int *deleted,
2340                                      struct ocfs2_extent_tree *et)
2341 {
2342         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2343         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2344         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2345         struct ocfs2_extent_block *eb;
2346
2347         *deleted = 0;
2348
2349         right_leaf_el = path_leaf_el(right_path);
2350         left_leaf_el = path_leaf_el(left_path);
2351         root_bh = left_path->p_node[subtree_index].bh;
2352         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2353
2354         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2355                 return 0;
2356
2357         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2358         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2359                 /*
2360                  * It's legal for us to proceed if the right leaf is
2361                  * the rightmost one and it has an empty extent. There
2362                  * are two cases to handle - whether the leaf will be
2363                  * empty after removal or not. If the leaf isn't empty
2364                  * then just remove the empty extent up front. The
2365                  * next block will handle empty leaves by flagging
2366                  * them for unlink.
2367                  *
2368          * Non-rightmost leaves will return -EAGAIN and the
2369                  * caller can manually move the subtree and retry.
2370                  */
2371
2372                 if (eb->h_next_leaf_blk != 0ULL)
2373                         return -EAGAIN;
2374
2375                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2376                         ret = ocfs2_journal_access(handle, inode,
2377                                                    path_leaf_bh(right_path),
2378                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2379                         if (ret) {
2380                                 mlog_errno(ret);
2381                                 goto out;
2382                         }
2383
2384                         ocfs2_remove_empty_extent(right_leaf_el);
2385                 } else
2386                         right_has_empty = 1;
2387         }
2388
2389         if (eb->h_next_leaf_blk == 0ULL &&
2390             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2391                 /*
2392                  * We have to update i_last_eb_blk during the meta
2393                  * data delete.
2394                  */
2395                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2396                                            OCFS2_JOURNAL_ACCESS_WRITE);
2397                 if (ret) {
2398                         mlog_errno(ret);
2399                         goto out;
2400                 }
2401
2402                 del_right_subtree = 1;
2403         }
2404
2405         /*
2406          * Getting here with an empty extent in the right path implies
2407          * that it's the rightmost path and will be deleted.
2408          */
2409         BUG_ON(right_has_empty && !del_right_subtree);
2410
2411         ret = ocfs2_journal_access(handle, inode, root_bh,
2412                                    OCFS2_JOURNAL_ACCESS_WRITE);
2413         if (ret) {
2414                 mlog_errno(ret);
2415                 goto out;
2416         }
2417
2418         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2419                 ret = ocfs2_journal_access(handle, inode,
2420                                            right_path->p_node[i].bh,
2421                                            OCFS2_JOURNAL_ACCESS_WRITE);
2422                 if (ret) {
2423                         mlog_errno(ret);
2424                         goto out;
2425                 }
2426
2427                 ret = ocfs2_journal_access(handle, inode,
2428                                            left_path->p_node[i].bh,
2429                                            OCFS2_JOURNAL_ACCESS_WRITE);
2430                 if (ret) {
2431                         mlog_errno(ret);
2432                         goto out;
2433                 }
2434         }
2435
2436         if (!right_has_empty) {
2437                 /*
2438                  * Only do this if we're moving a real
2439                  * record. Otherwise, the action is delayed until
2440                  * after removal of the right path in which case we
2441                  * can do a simple shift to remove the empty extent.
2442                  */
2443                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2444                 memset(&right_leaf_el->l_recs[0], 0,
2445                        sizeof(struct ocfs2_extent_rec));
2446         }
2447         if (eb->h_next_leaf_blk == 0ULL) {
2448                 /*
2449                  * Move recs over to get rid of empty extent, decrease
2450                  * next_free. This is allowed to remove the last
2451                  * extent in our leaf (setting l_next_free_rec to
2452                  * zero) - the delete code below won't care.
2453                  */
2454                 ocfs2_remove_empty_extent(right_leaf_el);
2455         }
2456
2457         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2458         if (ret)
2459                 mlog_errno(ret);
2460         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2461         if (ret)
2462                 mlog_errno(ret);
2463
2464         if (del_right_subtree) {
2465                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2466                                      subtree_index, dealloc);
2467                 ocfs2_update_edge_lengths(inode, handle, left_path);
2468
2469                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2470                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2471
2472                 /*
2473                  * Removal of the extent in the left leaf was skipped
2474                  * above so we could delete the right path
2475                  * 1st.
2476                  */
2477                 if (right_has_empty)
2478                         ocfs2_remove_empty_extent(left_leaf_el);
2479
2480                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2481                 if (ret)
2482                         mlog_errno(ret);
2483
2484                 *deleted = 1;
2485         } else
2486                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2487                                            subtree_index);
2488
2489 out:
2490         return ret;
2491 }
2492
2493 /*
2494  * Given a full path, determine what cpos value would return us a path
2495  * containing the leaf immediately to the right of the current one.
2496  *
2497  * Will return zero if the path passed in is already the rightmost path.
2498  *
2499  * This looks similar, but is subtly different to
2500  * ocfs2_find_cpos_for_left_leaf().
2501  */
2502 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2503                                           struct ocfs2_path *path, u32 *cpos)
2504 {
2505         int i, j, ret = 0;
2506         u64 blkno;
2507         struct ocfs2_extent_list *el;
2508
2509         *cpos = 0;
2510
2511         if (path->p_tree_depth == 0)
2512                 return 0;
2513
2514         blkno = path_leaf_bh(path)->b_blocknr;
2515
2516         /* Start at the tree node just above the leaf and work our way up. */
2517         i = path->p_tree_depth - 1;
2518         while (i >= 0) {
2519                 int next_free;
2520
2521                 el = path->p_node[i].el;
2522
2523                 /*
2524                  * Find the extent record just after the one in our
2525                  * path.
2526                  */
2527                 next_free = le16_to_cpu(el->l_next_free_rec);
2528                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2529                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2530                                 if (j == (next_free - 1)) {
2531                                         if (i == 0) {
2532                                                 /*
2533                                                  * We've determined that the
2534                                                  * path specified is already
2535                                                  * the rightmost one - return a
2536                                                  * cpos of zero.
2537                                                  */
2538                                                 goto out;
2539                                         }
2540                                         /*
2541                                          * The rightmost record points to our
2542                                          * leaf - we need to travel up the
2543                                          * tree one level.
2544                                          */
2545                                         goto next_node;
2546                                 }
2547
2548                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2549                                 goto out;
2550                         }
2551                 }
2552
2553                 /*
2554                  * If we got here, we never found a valid node where
2555                  * the tree indicated one should be.
2556                  */
2557                 ocfs2_error(sb,
2558                             "Invalid extent tree at extent block %llu\n",
2559                             (unsigned long long)blkno);
2560                 ret = -EROFS;
2561                 goto out;
2562
2563 next_node:
2564                 blkno = path->p_node[i].bh->b_blocknr;
2565                 i--;
2566         }
2567
2568 out:
2569         return ret;
2570 }
2571
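/*
 * Example of the value computed above (hypothetical record): if the
 * record immediately to the right of our path starts at e_cpos 216,
 * then *cpos is set to 216, the first cluster governed by the right
 * neighbor. A *cpos of zero means the path passed in was already the
 * rightmost one.
 */
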
2572 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2573                                             handle_t *handle,
2574                                             struct buffer_head *bh,
2575                                             struct ocfs2_extent_list *el)
2576 {
2577         int ret;
2578
2579         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2580                 return 0;
2581
2582         ret = ocfs2_journal_access(handle, inode, bh,
2583                                    OCFS2_JOURNAL_ACCESS_WRITE);
2584         if (ret) {
2585                 mlog_errno(ret);
2586                 goto out;
2587         }
2588
2589         ocfs2_remove_empty_extent(el);
2590
2591         ret = ocfs2_journal_dirty(handle, bh);
2592         if (ret)
2593                 mlog_errno(ret);
2594
2595 out:
2596         return ret;
2597 }
2598
2599 static int __ocfs2_rotate_tree_left(struct inode *inode,
2600                                     handle_t *handle, int orig_credits,
2601                                     struct ocfs2_path *path,
2602                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2603                                     struct ocfs2_path **empty_extent_path,
2604                                     struct ocfs2_extent_tree *et)
2605 {
2606         int ret, subtree_root, deleted;
2607         u32 right_cpos;
2608         struct ocfs2_path *left_path = NULL;
2609         struct ocfs2_path *right_path = NULL;
2610
2611         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2612
2613         *empty_extent_path = NULL;
2614
2615         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2616                                              &right_cpos);
2617         if (ret) {
2618                 mlog_errno(ret);
2619                 goto out;
2620         }
2621
2622         left_path = ocfs2_new_path(path_root_bh(path),
2623                                    path_root_el(path));
2624         if (!left_path) {
2625                 ret = -ENOMEM;
2626                 mlog_errno(ret);
2627                 goto out;
2628         }
2629
2630         ocfs2_cp_path(left_path, path);
2631
2632         right_path = ocfs2_new_path(path_root_bh(path),
2633                                     path_root_el(path));
2634         if (!right_path) {
2635                 ret = -ENOMEM;
2636                 mlog_errno(ret);
2637                 goto out;
2638         }
2639
2640         while (right_cpos) {
2641                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2642                 if (ret) {
2643                         mlog_errno(ret);
2644                         goto out;
2645                 }
2646
2647                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2648                                                        right_path);
2649
2650                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2651                      subtree_root,
2652                      (unsigned long long)
2653                      right_path->p_node[subtree_root].bh->b_blocknr,
2654                      right_path->p_tree_depth);
2655
2656                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2657                                                       orig_credits, left_path);
2658                 if (ret) {
2659                         mlog_errno(ret);
2660                         goto out;
2661                 }
2662
2663                 /*
2664                  * Caller might still want to make changes to the
2665                  * tree root, so re-add it to the journal here.
2666                  */
2667                 ret = ocfs2_journal_access(handle, inode,
2668                                            path_root_bh(left_path),
2669                                            OCFS2_JOURNAL_ACCESS_WRITE);
2670                 if (ret) {
2671                         mlog_errno(ret);
2672                         goto out;
2673                 }
2674
2675                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2676                                                 right_path, subtree_root,
2677                                                 dealloc, &deleted, et);
2678                 if (ret == -EAGAIN) {
2679                         /*
2680                          * The rotation has to temporarily stop due to
2681                          * the right subtree having an empty
2682                          * extent. Pass it back to the caller for a
2683                          * fixup.
2684                          */
2685                         *empty_extent_path = right_path;
2686                         right_path = NULL;
2687                         goto out;
2688                 }
2689                 if (ret) {
2690                         mlog_errno(ret);
2691                         goto out;
2692                 }
2693
2694                 /*
2695                  * The subtree rotate might have removed records on
2696                  * the rightmost edge. If so, then rotation is
2697                  * complete.
2698                  */
2699                 if (deleted)
2700                         break;
2701
2702                 ocfs2_mv_path(left_path, right_path);
2703
2704                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2705                                                      &right_cpos);
2706                 if (ret) {
2707                         mlog_errno(ret);
2708                         goto out;
2709                 }
2710         }
2711
2712 out:
2713         ocfs2_free_path(right_path);
2714         ocfs2_free_path(left_path);
2715
2716         return ret;
2717 }
2718
2719 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2720                                 struct ocfs2_path *path,
2721                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2722                                 struct ocfs2_extent_tree *et)
2723 {
2724         int ret, subtree_index;
2725         u32 cpos;
2726         struct ocfs2_path *left_path = NULL;
2727         struct ocfs2_extent_block *eb;
2728         struct ocfs2_extent_list *el;
2729
2730
2731         ret = ocfs2_et_sanity_check(inode, et);
2732         if (ret)
2733                 goto out;
2734         /*
2735          * There are two ways we handle this, depending on
2736          * whether path is the only existing one.
2737          */
2738         ret = ocfs2_extend_rotate_transaction(handle, 0,
2739                                               handle->h_buffer_credits,
2740                                               path);
2741         if (ret) {
2742                 mlog_errno(ret);
2743                 goto out;
2744         }
2745
2746         ret = ocfs2_journal_access_path(inode, handle, path);
2747         if (ret) {
2748                 mlog_errno(ret);
2749                 goto out;
2750         }
2751
2752         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2753         if (ret) {
2754                 mlog_errno(ret);
2755                 goto out;
2756         }
2757
2758         if (cpos) {
2759                 /*
2760                  * We have a path to the left of this one - it needs
2761                  * an update too.
2762                  */
2763                 left_path = ocfs2_new_path(path_root_bh(path),
2764                                            path_root_el(path));
2765                 if (!left_path) {
2766                         ret = -ENOMEM;
2767                         mlog_errno(ret);
2768                         goto out;
2769                 }
2770
2771                 ret = ocfs2_find_path(inode, left_path, cpos);
2772                 if (ret) {
2773                         mlog_errno(ret);
2774                         goto out;
2775                 }
2776
2777                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2778                 if (ret) {
2779                         mlog_errno(ret);
2780                         goto out;
2781                 }
2782
2783                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2784
2785                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2786                                      subtree_index, dealloc);
2787                 ocfs2_update_edge_lengths(inode, handle, left_path);
2788
2789                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2790                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2791         } else {
2792                 /*
2793                  * 'path' is also the leftmost path which
2794                  * means it must be the only one. This gets
2795                  * handled differently because we want to
2796                  * revert the inode back to having extents
2797                  * in-line.
2798                  */
2799                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2800
2801                 el = et->et_root_el;
2802                 el->l_tree_depth = 0;
2803                 el->l_next_free_rec = 0;
2804                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2805
2806                 ocfs2_et_set_last_eb_blk(et, 0);
2807         }
2808
2809         ocfs2_journal_dirty(handle, path_root_bh(path));
2810
2811 out:
2812         ocfs2_free_path(left_path);
2813         return ret;
2814 }
2815
2816 /*
2817  * Left rotation of btree records.
2818  *
2819  * In many ways, this is (unsurprisingly) the opposite of right
2820  * rotation. We start at some non-rightmost path containing an empty
2821  * extent in the leaf block. The code works its way to the rightmost
2822  * path by rotating records to the left in every subtree.
2823  *
2824  * This is used by any code which reduces the number of extent records
2825  * in a leaf. After removal, an empty record should be placed in the
2826  * leftmost list position.
2827  *
2828  * This won't handle a length update of the rightmost path records if
2829  * the rightmost tree leaf record is removed, so the caller is
2830  * responsible for detecting and correcting that.
2831  */
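/*
 * For illustration only, with hypothetical records (cluster counts in
 * parentheses): a record removal leaves the one legal empty record in
 * slot 0 of a leaf,
 *
 *	l_recs: [ empty ][ cpos 16 (8) ][ cpos 24 (4) ]
 *
 * and the left rotation removes it - in the trivial rightmost case
 * simply by shifting the remaining records down:
 *
 *	l_recs: [ cpos 16 (8) ][ cpos 24 (4) ]
 */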
2832 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2833                                   struct ocfs2_path *path,
2834                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2835                                   struct ocfs2_extent_tree *et)
2836 {
2837         int ret, orig_credits = handle->h_buffer_credits;
2838         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2839         struct ocfs2_extent_block *eb;
2840         struct ocfs2_extent_list *el;
2841
2842         el = path_leaf_el(path);
2843         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2844                 return 0;
2845
2846         if (path->p_tree_depth == 0) {
2847 rightmost_no_delete:
2848                 /*
2849                  * Inline extents. This is trivially handled, so do
2850                  * it up front.
2851                  */
2852                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2853                                                        path_leaf_bh(path),
2854                                                        path_leaf_el(path));
2855                 if (ret)
2856                         mlog_errno(ret);
2857                 goto out;
2858         }
2859
2860         /*
2861          * Handle rightmost branch now. There are several cases:
2862          *  1) simple rotation leaving records in there. That's trivial.
2863          *  2) rotation requiring a branch delete - there are no more
2864          *     records left. Two cases of this:
2865          *     a) There are branches to the left.
2866          *     b) This is also the leftmost (the only) branch.
2867          *
2868          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2869          *  2a) we need the left branch so that we can update it with the unlink
2870          *  2b) we need to bring the inode back to inline extents.
2871          */
2872
2873         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2874         el = &eb->h_list;
2875         if (eb->h_next_leaf_blk == 0) {
2876                 /*
2877                  * This gets a bit tricky if we're going to delete the
2878                  * rightmost path. Get the other cases out of the way
2879                  * 1st.
2880                  */
2881                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2882                         goto rightmost_no_delete;
2883
2884                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2885                         ret = -EIO;
2886                         ocfs2_error(inode->i_sb,
2887                                     "Inode %llu has empty extent block at %llu",
2888                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2889                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2890                         goto out;
2891                 }
2892
2893                 /*
2894                  * XXX: The caller cannot trust "path" any more after
2895                  * this as it will have been deleted. What do we do?
2896                  *
2897                  * In theory the rotate-for-merge code will never get
2898                  * here because it'll always ask for a rotate in a
2899                  * nonempty list.
2900                  */
2901
2902                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2903                                                   dealloc, et);
2904                 if (ret)
2905                         mlog_errno(ret);
2906                 goto out;
2907         }
2908
2909         /*
2910          * Now we can loop, remembering the path we get from -EAGAIN
2911          * and restarting from there.
2912          */
2913 try_rotate:
2914         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2915                                        dealloc, &restart_path, et);
2916         if (ret && ret != -EAGAIN) {
2917                 mlog_errno(ret);
2918                 goto out;
2919         }
2920
2921         while (ret == -EAGAIN) {
2922                 tmp_path = restart_path;
2923                 restart_path = NULL;
2924
2925                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2926                                                tmp_path, dealloc,
2927                                                &restart_path, et);
2928                 if (ret && ret != -EAGAIN) {
2929                         mlog_errno(ret);
2930                         goto out;
2931                 }
2932
2933                 ocfs2_free_path(tmp_path);
2934                 tmp_path = NULL;
2935
2936                 if (ret == 0)
2937                         goto try_rotate;
2938         }
2939
2940 out:
2941         ocfs2_free_path(tmp_path);
2942         ocfs2_free_path(restart_path);
2943         return ret;
2944 }
2945
2946 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2947                                 int index)
2948 {
2949         struct ocfs2_extent_rec *rec = &el->l_recs[index];
2950         unsigned int size;
2951
2952         if (rec->e_leaf_clusters == 0) {
2953                 /*
2954                  * We consumed all of the merged-from record. An empty
2955                  * extent cannot exist anywhere but the 1st array
2956                  * position, so move things over if the merged-from
2957                  * record doesn't occupy that position.
2958                  *
2959                  * This creates a new empty extent so the caller
2960                  * should be smart enough to have removed any existing
2961                  * ones.
2962                  */
2963                 if (index > 0) {
2964                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2965                         size = index * sizeof(struct ocfs2_extent_rec);
2966                         memmove(&el->l_recs[1], &el->l_recs[0], size);
2967                 }
2968
2969                 /*
2970                  * Always memset - the caller doesn't check whether it
2971                  * created an empty extent, so there could be junk in
2972                  * the other fields.
2973                  */
2974                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2975         }
2976 }
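/*
 * A hypothetical example of the cleanup above: if the merge consumed
 * all of the record at index 2,
 *
 *	l_recs: [ cpos 0 (4) ][ cpos 4 (8) ][ cpos 12 (0) ]
 *
 * the memmove() shifts the two preceding records up by one slot and
 * the memset() zeroes slot 0, leaving the single legal empty record
 * at the front:
 *
 *	l_recs: [ empty ][ cpos 0 (4) ][ cpos 4 (8) ]
 */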
2977
2978 static int ocfs2_get_right_path(struct inode *inode,
2979                                 struct ocfs2_path *left_path,
2980                                 struct ocfs2_path **ret_right_path)
2981 {
2982         int ret;
2983         u32 right_cpos;
2984         struct ocfs2_path *right_path = NULL;
2985         struct ocfs2_extent_list *left_el;
2986
2987         *ret_right_path = NULL;
2988
2989         /* This function shouldn't be called for non-trees. */
2990         BUG_ON(left_path->p_tree_depth == 0);
2991
2992         left_el = path_leaf_el(left_path);
2993         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2994
2995         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2996                                              &right_cpos);
2997         if (ret) {
2998                 mlog_errno(ret);
2999                 goto out;
3000         }
3001
3002         /* This function shouldn't be called for the rightmost leaf. */
3003         BUG_ON(right_cpos == 0);
3004
3005         right_path = ocfs2_new_path(path_root_bh(left_path),
3006                                     path_root_el(left_path));
3007         if (!right_path) {
3008                 ret = -ENOMEM;
3009                 mlog_errno(ret);
3010                 goto out;
3011         }
3012
3013         ret = ocfs2_find_path(inode, right_path, right_cpos);
3014         if (ret) {
3015                 mlog_errno(ret);
3016                 goto out;
3017         }
3018
3019         *ret_right_path = right_path;
3020 out:
3021         if (ret)
3022                 ocfs2_free_path(right_path);
3023         return ret;
3024 }
3025
3026 /*
3027  * Remove split_rec clusters from the record at index and merge them
3028  * onto the beginning of the record "next" to it.
3029  * For index < l_count - 1, "next" means the extent rec at index + 1.
3030  * For index == l_count - 1, the "next" means the 1st extent rec of the
3031  * next extent block.
3032  */
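/*
 * A hypothetical in-leaf example (index < l_next_free_rec - 1),
 * assuming one block per cluster for simplicity: merging a 4-cluster
 * split_rec off the tail of the record at index,
 *
 *	before: [ cpos 0, blkno 100, 12 clusters ][ cpos 12, blkno 112,  8 ]
 *	after:  [ cpos 0, blkno 100,  8 clusters ][ cpos  8, blkno 108, 12 ]
 *
 * i.e. the left record shrinks by split_clusters while the right
 * record's e_cpos/e_blkno move back and its e_leaf_clusters grows by
 * the same amount.
 */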
3033 static int ocfs2_merge_rec_right(struct inode *inode,
3034                                  struct ocfs2_path *left_path,
3035                                  handle_t *handle,
3036                                  struct ocfs2_extent_rec *split_rec,
3037                                  int index)
3038 {
3039         int ret, next_free, i;
3040         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3041         struct ocfs2_extent_rec *left_rec;
3042         struct ocfs2_extent_rec *right_rec;
3043         struct ocfs2_extent_list *right_el;
3044         struct ocfs2_path *right_path = NULL;
3045         int subtree_index = 0;
3046         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3047         struct buffer_head *bh = path_leaf_bh(left_path);
3048         struct buffer_head *root_bh = NULL;
3049
3050         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3051         left_rec = &el->l_recs[index];
3052
3053         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3054             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3055                 /* This is a cross-extent-block merge. */
3056                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3057                 if (ret) {
3058                         mlog_errno(ret);
3059                         goto out;
3060                 }
3061
3062                 right_el = path_leaf_el(right_path);
3063                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3064                 BUG_ON(next_free <= 0);
3065                 right_rec = &right_el->l_recs[0];
3066                 if (ocfs2_is_empty_extent(right_rec)) {
3067                         BUG_ON(next_free <= 1);
3068                         right_rec = &right_el->l_recs[1];
3069                 }
3070
3071                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3072                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3073                        le32_to_cpu(right_rec->e_cpos));
3074
3075                 subtree_index = ocfs2_find_subtree_root(inode,
3076                                                         left_path, right_path);
3077
3078                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3079                                                       handle->h_buffer_credits,
3080                                                       right_path);
3081                 if (ret) {
3082                         mlog_errno(ret);
3083                         goto out;
3084                 }
3085
3086                 root_bh = left_path->p_node[subtree_index].bh;
3087                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3088
3089                 ret = ocfs2_journal_access(handle, inode, root_bh,
3090                                            OCFS2_JOURNAL_ACCESS_WRITE);
3091                 if (ret) {
3092                         mlog_errno(ret);
3093                         goto out;
3094                 }
3095
3096                 for (i = subtree_index + 1;
3097                      i < path_num_items(right_path); i++) {
3098                         ret = ocfs2_journal_access(handle, inode,
3099                                                    right_path->p_node[i].bh,
3100                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3101                         if (ret) {
3102                                 mlog_errno(ret);
3103                                 goto out;
3104                         }
3105
3106                         ret = ocfs2_journal_access(handle, inode,
3107                                                    left_path->p_node[i].bh,
3108                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3109                         if (ret) {
3110                                 mlog_errno(ret);
3111                                 goto out;
3112                         }
3113                 }
3114
3115         } else {
3116                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3117                 right_rec = &el->l_recs[index + 1];
3118         }
3119
3120         ret = ocfs2_journal_access(handle, inode, bh,
3121                                    OCFS2_JOURNAL_ACCESS_WRITE);
3122         if (ret) {
3123                 mlog_errno(ret);
3124                 goto out;
3125         }
3126
3127         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3128
3129         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3130         le64_add_cpu(&right_rec->e_blkno,
3131                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3132         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3133
3134         ocfs2_cleanup_merge(el, index);
3135
3136         ret = ocfs2_journal_dirty(handle, bh);
3137         if (ret)
3138                 mlog_errno(ret);
3139
3140         if (right_path) {
3141                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3142                 if (ret)
3143                         mlog_errno(ret);
3144
3145                 ocfs2_complete_edge_insert(inode, handle, left_path,
3146                                            right_path, subtree_index);
3147         }
3148 out:
3149         if (right_path)
3150                 ocfs2_free_path(right_path);
3151         return ret;
3152 }
3153
3154 static int ocfs2_get_left_path(struct inode *inode,
3155                                struct ocfs2_path *right_path,
3156                                struct ocfs2_path **ret_left_path)
3157 {
3158         int ret;
3159         u32 left_cpos;
3160         struct ocfs2_path *left_path = NULL;
3161
3162         *ret_left_path = NULL;
3163
3164         /* This function shouldn't be called for non-trees. */
3165         BUG_ON(right_path->p_tree_depth == 0);
3166
3167         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3168                                             right_path, &left_cpos);
3169         if (ret) {
3170                 mlog_errno(ret);
3171                 goto out;
3172         }
3173
3174         /* This function shouldn't be called for the leftmost leaf. */
3175         BUG_ON(left_cpos == 0);
3176
3177         left_path = ocfs2_new_path(path_root_bh(right_path),
3178                                    path_root_el(right_path));
3179         if (!left_path) {
3180                 ret = -ENOMEM;
3181                 mlog_errno(ret);
3182                 goto out;
3183         }
3184
3185         ret = ocfs2_find_path(inode, left_path, left_cpos);
3186         if (ret) {
3187                 mlog_errno(ret);
3188                 goto out;
3189         }
3190
3191         *ret_left_path = left_path;
3192 out:
3193         if (ret)
3194                 ocfs2_free_path(left_path);
3195         return ret;
3196 }
3197
3198 /*
3199  * Remove split_rec clusters from the record at index and merge them
3200  * onto the tail of the record "before" it.
3201  * For index > 0, the "before" means the extent rec at index - 1.
3202  *
3203  * For index == 0, the "before" means the last record of the previous
3204  * extent block. In that case we may also need to remove the rightmost
3205  * leaf extent block from the right_path and change the right path to
3206  * indicate the new rightmost path.
3207  */
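/*
 * The in-leaf case (index > 0) is the mirror image of
 * ocfs2_merge_rec_right(). A hypothetical example, again assuming one
 * block per cluster: merging a 4-cluster split_rec off the front of
 * the record at index,
 *
 *	before: [ cpos 0, blkno 100,  8 clusters ][ cpos  8, blkno 108, 12 ]
 *	after:  [ cpos 0, blkno 100, 12 clusters ][ cpos 12, blkno 112,  8 ]
 *
 * i.e. the left record grows by split_clusters while the right
 * record's e_cpos/e_blkno move forward and its e_leaf_clusters shrinks
 * by the same amount.
 */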
3208 static int ocfs2_merge_rec_left(struct inode *inode,
3209                                 struct ocfs2_path *right_path,
3210                                 handle_t *handle,
3211                                 struct ocfs2_extent_rec *split_rec,
3212                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3213                                 struct ocfs2_extent_tree *et,
3214                                 int index)
3215 {
3216         int ret, i, subtree_index = 0, has_empty_extent = 0;
3217         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3218         struct ocfs2_extent_rec *left_rec;
3219         struct ocfs2_extent_rec *right_rec;
3220         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3221         struct buffer_head *bh = path_leaf_bh(right_path);
3222         struct buffer_head *root_bh = NULL;
3223         struct ocfs2_path *left_path = NULL;
3224         struct ocfs2_extent_list *left_el;
3225
3226         BUG_ON(index < 0);
3227
3228         right_rec = &el->l_recs[index];
3229         if (index == 0) {
3230                 /* This is a cross-extent-block merge. */
3231                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3232                 if (ret) {
3233                         mlog_errno(ret);
3234                         goto out;
3235                 }
3236
3237                 left_el = path_leaf_el(left_path);
3238                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3239                        le16_to_cpu(left_el->l_count));
3240
3241                 left_rec = &left_el->l_recs[
3242                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3243                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3244                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3245                        le32_to_cpu(split_rec->e_cpos));
3246
3247                 subtree_index = ocfs2_find_subtree_root(inode,
3248                                                         left_path, right_path);
3249
3250                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3251                                                       handle->h_buffer_credits,
3252                                                       left_path);
3253                 if (ret) {
3254                         mlog_errno(ret);
3255                         goto out;
3256                 }
3257
3258                 root_bh = left_path->p_node[subtree_index].bh;
3259                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3260
3261                 ret = ocfs2_journal_access(handle, inode, root_bh,
3262                                            OCFS2_JOURNAL_ACCESS_WRITE);
3263                 if (ret) {
3264                         mlog_errno(ret);
3265                         goto out;
3266                 }
3267
3268                 for (i = subtree_index + 1;
3269                      i < path_num_items(right_path); i++) {
3270                         ret = ocfs2_journal_access(handle, inode,
3271                                                    right_path->p_node[i].bh,
3272                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3273                         if (ret) {
3274                                 mlog_errno(ret);
3275                                 goto out;
3276                         }
3277
3278                         ret = ocfs2_journal_access(handle, inode,
3279                                                    left_path->p_node[i].bh,
3280                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3281                         if (ret) {
3282                                 mlog_errno(ret);
3283                                 goto out;
3284                         }
3285                 }
3286         } else {
3287                 left_rec = &el->l_recs[index - 1];
3288                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3289                         has_empty_extent = 1;
3290         }
3291
3292         ret = ocfs2_journal_access(handle, inode, bh,
3293                                    OCFS2_JOURNAL_ACCESS_WRITE);
3294         if (ret) {
3295                 mlog_errno(ret);
3296                 goto out;
3297         }
3298
3299         if (has_empty_extent && index == 1) {
3300                 /*
3301                  * The easy case - we can just plop the record right in.
3302                  */
3303                 *left_rec = *split_rec;
3304
3305                 has_empty_extent = 0;
3306         } else
3307                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3308
3309         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3310         le64_add_cpu(&right_rec->e_blkno,
3311                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3312         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3313
3314         ocfs2_cleanup_merge(el, index);
3315
3316         ret = ocfs2_journal_dirty(handle, bh);
3317         if (ret)
3318                 mlog_errno(ret);
3319
3320         if (left_path) {
3321                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3322                 if (ret)
3323                         mlog_errno(ret);
3324
3325                 /*
3326                  * If the right_rec is empty and the extent block is also
3327                  * empty, ocfs2_complete_edge_insert() can't handle it and
3328                  * we need to delete the right extent block.
3329                  */
3330                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3331                     le16_to_cpu(el->l_next_free_rec) == 1) {
3332
3333                         ret = ocfs2_remove_rightmost_path(inode, handle,
3334                                                           right_path,
3335                                                           dealloc, et);
3336                         if (ret) {
3337                                 mlog_errno(ret);
3338                                 goto out;
3339                         }
3340
3341                         /* Now the rightmost extent block has been deleted.
3342                          * So we use the new rightmost path.
3343                          */
3344                         ocfs2_mv_path(right_path, left_path);
3345                         left_path = NULL;
3346                 } else
3347                         ocfs2_complete_edge_insert(inode, handle, left_path,
3348                                                    right_path, subtree_index);
3349         }
3350 out:
3351         if (left_path)
3352                 ocfs2_free_path(left_path);
3353         return ret;
3354 }
3355
3356 static int ocfs2_try_to_merge_extent(struct inode *inode,
3357                                      handle_t *handle,
3358                                      struct ocfs2_path *path,
3359                                      int split_index,
3360                                      struct ocfs2_extent_rec *split_rec,
3361                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3362                                      struct ocfs2_merge_ctxt *ctxt,
3363                                      struct ocfs2_extent_tree *et)
3365 {
3366         int ret = 0;
3367         struct ocfs2_extent_list *el = path_leaf_el(path);
3368         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3369
3370         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3371
3372         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3373                 /*
3374                  * The merge code will need to create an empty
3375                  * extent to take the place of the newly
3376                  * emptied slot. Remove any pre-existing empty
3377                  * extents - having more than one in a leaf is
3378                  * illegal.
3379                  */
3380                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3381                                              dealloc, et);
3382                 if (ret) {
3383                         mlog_errno(ret);
3384                         goto out;
3385                 }
3386                 split_index--;
3387                 rec = &el->l_recs[split_index];
3388         }
3389
3390         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3391                 /*
3392                  * Left-right contig implies this.
3393                  */
3394                 BUG_ON(!ctxt->c_split_covers_rec);
3395
3396                 /*
3397                  * Since the leftright insert always covers the entire
3398                  * extent, this call will delete the insert record
3399                  * entirely, resulting in an empty extent record added to
3400                  * the extent block.
3401                  *
3402                  * Since the adding of an empty extent shifts
3403                  * everything back to the right, there's no need to
3404                  * update split_index here.
3405                  *
3406                  * When the split_index is zero, we need to merge it into the
3407                  * previous extent block. It is more efficient and easier
3408                  * if we do merge_right first and merge_left later.
3409                  */
3410                 ret = ocfs2_merge_rec_right(inode, path,
3411                                             handle, split_rec,
3412                                             split_index);
3413                 if (ret) {
3414                         mlog_errno(ret);
3415                         goto out;
3416                 }
3417
3418                 /*
3419                  * We can only get this from logic error above.
3420                  */
3421                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3422
3423                 /* The merge left us with an empty extent, remove it. */
3424                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3425                                              dealloc, et);
3426                 if (ret) {
3427                         mlog_errno(ret);
3428                         goto out;
3429                 }
3430
3431                 rec = &el->l_recs[split_index];
3432
3433                 /*
3434                  * Note that we don't pass split_rec here on purpose -
3435                  * we've merged it into the rec already.
3436                  */
3437                 ret = ocfs2_merge_rec_left(inode, path,
3438                                            handle, rec,
3439                                            dealloc, et,
3440                                            split_index);
3441
3442                 if (ret) {
3443                         mlog_errno(ret);
3444                         goto out;
3445                 }
3446
3447                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3448                                              dealloc, et);
3449                 /*
3450                  * Error from this last rotate is not critical, so
3451                  * print but don't bubble it up.
3452                  */
3453                 if (ret)
3454                         mlog_errno(ret);
3455                 ret = 0;
3456         } else {
3457                 /*
3458                  * Merge a record to the left or right.
3459                  *
3460                  * 'contig_type' is relative to the existing record,
3461                  * so for example, if we're "right contig", it's to
3462                  * the record on the left (hence the left merge).
3463                  */
3464                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3465                         ret = ocfs2_merge_rec_left(inode,
3466                                                    path,
3467                                                    handle, split_rec,
3468                                                    dealloc, et,
3469                                                    split_index);
3470                         if (ret) {
3471                                 mlog_errno(ret);
3472                                 goto out;
3473                         }
3474                 } else {
3475                         ret = ocfs2_merge_rec_right(inode,
3476                                                     path,
3477                                                     handle, split_rec,
3478                                                     split_index);
3479                         if (ret) {
3480                                 mlog_errno(ret);
3481                                 goto out;
3482                         }
3483                 }
3484
3485                 if (ctxt->c_split_covers_rec) {
3486                         /*
3487                          * The merge may have left an empty extent in
3488                          * our leaf. Try to rotate it away.
3489                          */
3490                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3491                                                      dealloc, et);
3492                         if (ret)
3493                                 mlog_errno(ret);
3494                         ret = 0;
3495                 }
3496         }
3497
3498 out:
3499         return ret;
3500 }
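/*
 * To summarize the CONTIG_LEFTRIGHT case above: because the split
 * record covers the whole existing record, the right merge empties it,
 * a left rotation removes the resulting empty extent, the merged
 * record is then folded into its left neighbour, and a final
 * best-effort rotation cleans up whatever empty extent that second
 * merge leaves behind.
 */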
3501
3502 static void ocfs2_subtract_from_rec(struct super_block *sb,
3503                                     enum ocfs2_split_type split,
3504                                     struct ocfs2_extent_rec *rec,
3505                                     struct ocfs2_extent_rec *split_rec)
3506 {
3507         u64 len_blocks;
3508
3509         len_blocks = ocfs2_clusters_to_blocks(sb,
3510                                 le16_to_cpu(split_rec->e_leaf_clusters));
3511
3512         if (split == SPLIT_LEFT) {
3513                 /*
3514                  * Region is on the left edge of the existing
3515                  * record.
3516                  */
3517                 le32_add_cpu(&rec->e_cpos,
3518                              le16_to_cpu(split_rec->e_leaf_clusters));
3519                 le64_add_cpu(&rec->e_blkno, len_blocks);
3520                 le16_add_cpu(&rec->e_leaf_clusters,
3521                              -le16_to_cpu(split_rec->e_leaf_clusters));
3522         } else {
3523                 /*
3524                  * Region is on the right edge of the existing
3525                  * record.
3526                  */
3527                 le16_add_cpu(&rec->e_leaf_clusters,
3528                              -le16_to_cpu(split_rec->e_leaf_clusters));
3529         }
3530 }
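/*
 * Hypothetical numbers for the helper above, assuming one block per
 * cluster: a SPLIT_LEFT of 4 clusters off the left edge of
 * { e_cpos 8, e_blkno 208, e_leaf_clusters 16 } leaves
 * { e_cpos 12, e_blkno 212, e_leaf_clusters 12 }, while a SPLIT_RIGHT
 * of the same size only trims e_leaf_clusters, leaving
 * { e_cpos 8, e_blkno 208, e_leaf_clusters 12 }.
 */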
3531
3532 /*
3533  * Do the final bits of extent record insertion at the target leaf
3534  * list. If this leaf is part of an allocation tree, it is assumed
3535  * that the tree above has been prepared.
3536  */
3537 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3538                                  struct ocfs2_extent_list *el,
3539                                  struct ocfs2_insert_type *insert,
3540                                  struct inode *inode)
3541 {
3542         int i = insert->ins_contig_index;
3543         unsigned int range;
3544         struct ocfs2_extent_rec *rec;
3545
3546         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3547
3548         if (insert->ins_split != SPLIT_NONE) {
3549                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3550                 BUG_ON(i == -1);
3551                 rec = &el->l_recs[i];
3552                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3553                                         insert_rec);
3554                 goto rotate;
3555         }
3556
3557         /*
3558          * Contiguous insert - either left or right.
3559          */
3560         if (insert->ins_contig != CONTIG_NONE) {
3561                 rec = &el->l_recs[i];
3562                 if (insert->ins_contig == CONTIG_LEFT) {
3563                         rec->e_blkno = insert_rec->e_blkno;
3564                         rec->e_cpos = insert_rec->e_cpos;
3565                 }
3566                 le16_add_cpu(&rec->e_leaf_clusters,
3567                              le16_to_cpu(insert_rec->e_leaf_clusters));
3568                 return;
3569         }
3570
3571         /*
3572          * Handle insert into an empty leaf.
3573          */
3574         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3575             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3576              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3577                 el->l_recs[0] = *insert_rec;
3578                 el->l_next_free_rec = cpu_to_le16(1);
3579                 return;
3580         }
3581
3582         /*
3583          * Appending insert.
3584          */
3585         if (insert->ins_appending == APPEND_TAIL) {
3586                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3587                 rec = &el->l_recs[i];
3588                 range = le32_to_cpu(rec->e_cpos)
3589                         + le16_to_cpu(rec->e_leaf_clusters);
3590                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3591
3592                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3593                                 le16_to_cpu(el->l_count),
3594                                 "inode %lu, depth %u, count %u, next free %u, "
3595                                 "rec.cpos %u, rec.clusters %u, "
3596                                 "insert.cpos %u, insert.clusters %u\n",
3597                                 inode->i_ino,
3598                                 le16_to_cpu(el->l_tree_depth),
3599                                 le16_to_cpu(el->l_count),
3600                                 le16_to_cpu(el->l_next_free_rec),
3601                                 le32_to_cpu(el->l_recs[i].e_cpos),
3602                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3603                                 le32_to_cpu(insert_rec->e_cpos),
3604                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3605                 i++;
3606                 el->l_recs[i] = *insert_rec;
3607                 le16_add_cpu(&el->l_next_free_rec, 1);
3608                 return;
3609         }
3610
3611 rotate:
3612         /*
3613          * Ok, we have to rotate.
3614          *
3615          * At this point, it is safe to assume that inserting into an
3616          * empty leaf and appending to a leaf have both been handled
3617          * above.
3618          *
3619          * This leaf needs to have space, either by the empty 1st
3620          * extent record, or by virtue of an l_next_free_rec < l_count.
3621          */
3622         ocfs2_rotate_leaf(el, insert_rec);
3623 }
3624
3625 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3626                                            handle_t *handle,
3627                                            struct ocfs2_path *path,
3628                                            struct ocfs2_extent_rec *insert_rec)
3629 {
3630         int ret, i, next_free;
3631         struct buffer_head *bh;
3632         struct ocfs2_extent_list *el;
3633         struct ocfs2_extent_rec *rec;
3634
3635         /*
3636          * Update everything except the leaf block.
3637          */
3638         for (i = 0; i < path->p_tree_depth; i++) {
3639                 bh = path->p_node[i].bh;
3640                 el = path->p_node[i].el;
3641
3642                 next_free = le16_to_cpu(el->l_next_free_rec);
3643                 if (next_free == 0) {
3644                         ocfs2_error(inode->i_sb,
3645                                     "Dinode %llu has a bad extent list",
3646                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3647                         ret = -EIO;
3648                         return;
3649                 }
3650
3651                 rec = &el->l_recs[next_free - 1];
3652
3653                 rec->e_int_clusters = insert_rec->e_cpos;
3654                 le32_add_cpu(&rec->e_int_clusters,
3655                              le16_to_cpu(insert_rec->e_leaf_clusters));
3656                 le32_add_cpu(&rec->e_int_clusters,
3657                              -le32_to_cpu(rec->e_cpos));
3658
3659                 ret = ocfs2_journal_dirty(handle, bh);
3660                 if (ret)
3661                         mlog_errno(ret);
3662
3663         }
3664 }
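/*
 * The arithmetic above simply re-derives each interior record's length
 * from the end of the appended record. With hypothetical values
 * rec->e_cpos = 64, insert_rec->e_cpos = 96 and
 * insert_rec->e_leaf_clusters = 8, the new e_int_clusters is
 * (96 + 8) - 64 = 40, i.e. effectively
 *
 *	e_int_clusters = (insert_rec->e_cpos + insert_rec->e_leaf_clusters)
 *			 - rec->e_cpos;
 *
 * once the little-endian conversions are stripped away.
 */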
3665
3666 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3667                                     struct ocfs2_extent_rec *insert_rec,
3668                                     struct ocfs2_path *right_path,
3669                                     struct ocfs2_path **ret_left_path)
3670 {
3671         int ret, next_free;
3672         struct ocfs2_extent_list *el;
3673         struct ocfs2_path *left_path = NULL;
3674
3675         *ret_left_path = NULL;
3676
3677         /*
3678          * This shouldn't happen for non-trees. The extent rec cluster
3679          * count manipulation below only works for interior nodes.
3680          */
3681         BUG_ON(right_path->p_tree_depth == 0);
3682
3683         /*
3684          * If our appending insert is at the leftmost edge of a leaf,
3685          * then we might need to update the rightmost records of the
3686          * neighboring path.
3687          */
3688         el = path_leaf_el(right_path);
3689         next_free = le16_to_cpu(el->l_next_free_rec);
3690         if (next_free == 0 ||
3691             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3692                 u32 left_cpos;
3693
3694                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3695                                                     &left_cpos);
3696                 if (ret) {
3697                         mlog_errno(ret);
3698                         goto out;
3699                 }
3700
3701                 mlog(0, "Append may need a left path update. cpos: %u, "
3702                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3703                      left_cpos);
3704
3705                 /*
3706                  * No need to worry if the append is already in the
3707                  * leftmost leaf.
3708                  */
3709                 if (left_cpos) {
3710                         left_path = ocfs2_new_path(path_root_bh(right_path),
3711                                                    path_root_el(right_path));
3712                         if (!left_path) {
3713                                 ret = -ENOMEM;
3714                                 mlog_errno(ret);
3715                                 goto out;
3716                         }
3717
3718                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3719                         if (ret) {
3720                                 mlog_errno(ret);
3721                                 goto out;
3722                         }
3723
3724                         /*
3725                          * ocfs2_insert_path() will pass the left_path to the
3726                          * journal for us.
3727                          */
3728                 }
3729         }
3730
3731         ret = ocfs2_journal_access_path(inode, handle, right_path);
3732         if (ret) {
3733                 mlog_errno(ret);
3734                 goto out;
3735         }
3736
3737         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3738
3739         *ret_left_path = left_path;
3740         ret = 0;
3741 out:
3742         if (ret != 0)
3743                 ocfs2_free_path(left_path);
3744
3745         return ret;
3746 }
3747
3748 static void ocfs2_split_record(struct inode *inode,
3749                                struct ocfs2_path *left_path,
3750                                struct ocfs2_path *right_path,
3751                                struct ocfs2_extent_rec *split_rec,
3752                                enum ocfs2_split_type split)
3753 {
3754         int index;
3755         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3756         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3757         struct ocfs2_extent_rec *rec, *tmprec;
3758
3759         right_el = path_leaf_el(right_path);
3760         if (left_path)
3761                 left_el = path_leaf_el(left_path);
3762
3763         el = right_el;
3764         insert_el = right_el;
3765         index = ocfs2_search_extent_list(el, cpos);
3766         if (index != -1) {
3767                 if (index == 0 && left_path) {
3768                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3769
3770                         /*
3771                          * This typically means that the record
3772                          * started in the left path but moved to the
3773                          * right as a result of rotation. We either
3774                          * move the existing record to the left, or we
3775                          * do the later insert there.
3776                          *
3777                          * In this case, the left path should always
3778                          * exist as the rotate code will have passed
3779                          * it back for a post-insert update.
3780                          */
3781
3782                         if (split == SPLIT_LEFT) {
3783                                 /*
3784                                  * It's a left split. Since we know
3785                                  * that the rotate code gave us an
3786                                  * empty extent in the left path, we
3787                                  * can just do the insert there.
3788                                  */
3789                                 insert_el = left_el;
3790                         } else {
3791                                 /*
3792                                  * Right split - we have to move the
3793                                  * existing record over to the left
3794                                  * leaf. The insert will be into the
3795                                  * newly created empty extent in the
3796                                  * right leaf.
3797                                  */
3798                                 tmprec = &right_el->l_recs[index];
3799                                 ocfs2_rotate_leaf(left_el, tmprec);
3800                                 el = left_el;
3801
3802                                 memset(tmprec, 0, sizeof(*tmprec));
3803                                 index = ocfs2_search_extent_list(left_el, cpos);
3804                                 BUG_ON(index == -1);
3805                         }
3806                 }
3807         } else {
3808                 BUG_ON(!left_path);
3809                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3810                 /*
3811                  * Left path is easy - we can just allow the insert to
3812                  * happen.
3813                  */
3814                 el = left_el;
3815                 insert_el = left_el;
3816                 index = ocfs2_search_extent_list(el, cpos);
3817                 BUG_ON(index == -1);
3818         }
3819
3820         rec = &el->l_recs[index];
3821         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3822         ocfs2_rotate_leaf(insert_el, split_rec);
3823 }
3824
3825 /*
3826  * This function only does inserts on an allocation b-tree. For tree
3827  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3828  *
3829  * right_path is the path we want to do the actual insert
3830  * in. left_path should only be passed in if we need to update that
3831  * portion of the tree after an edge insert.
3832  */
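/*
 * Roughly, the sequence below is: extend the transaction if left_path
 * was handed back without being accounted for, take journal access on
 * both paths, either split the target record in place or do the plain
 * leaf insert, dirty the touched leaf buffers, and finally let
 * ocfs2_complete_edge_insert() fix up the interior records shared by
 * the two paths.
 */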
3833 static int ocfs2_insert_path(struct inode *inode,
3834                              handle_t *handle,
3835                              struct ocfs2_path *left_path,
3836                              struct ocfs2_path *right_path,
3837                              struct ocfs2_extent_rec *insert_rec,
3838                              struct ocfs2_insert_type *insert)
3839 {
3840         int ret, subtree_index;
3841         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3842
3843         if (left_path) {
3844                 int credits = handle->h_buffer_credits;
3845
3846                 /*
3847                  * There's a chance that left_path got passed back to
3848                  * us without being accounted for in the
3849                  * journal. Extend our transaction here to be sure we
3850                  * can change those blocks.
3851                  */
3852                 credits += left_path->p_tree_depth;
3853
3854                 ret = ocfs2_extend_trans(handle, credits);
3855                 if (ret < 0) {
3856                         mlog_errno(ret);
3857                         goto out;
3858                 }
3859
3860                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3861                 if (ret < 0) {
3862                         mlog_errno(ret);
3863                         goto out;
3864                 }
3865         }
3866
3867         /*
3868          * Pass both paths to the journal. The majority of inserts
3869          * will be touching all components anyway.
3870          */
3871         ret = ocfs2_journal_access_path(inode, handle, right_path);
3872         if (ret < 0) {
3873                 mlog_errno(ret);
3874                 goto out;
3875         }
3876
3877         if (insert->ins_split != SPLIT_NONE) {
3878                 /*
3879                  * We could call ocfs2_insert_at_leaf() for some types
3880                  * of splits, but it's easier to just let one separate
3881                  * function sort it all out.
3882                  */
3883                 ocfs2_split_record(inode, left_path, right_path,
3884                                    insert_rec, insert->ins_split);
3885
3886                 /*
3887                  * Split might have modified either leaf and we don't
3888                  * have a guarantee that the later edge insert will
3889                  * dirty this for us.
3890                  */
3891                 if (left_path) {
3892                         ret = ocfs2_journal_dirty(handle,
3893                                                   path_leaf_bh(left_path));
3894                         if (ret)
3895                                 mlog_errno(ret);
                     }
3896         } else
3897                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3898                                      insert, inode);
3899
3900         ret = ocfs2_journal_dirty(handle, leaf_bh);
3901         if (ret)
3902                 mlog_errno(ret);
3903
3904         if (left_path) {
3905                 /*
3906                  * The rotate code has indicated that we need to fix
3907                  * up portions of the tree after the insert.
3908                  *
3909                  * XXX: Should we extend the transaction here?
3910                  */
3911                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3912                                                         right_path);
3913                 ocfs2_complete_edge_insert(inode, handle, left_path,
3914                                            right_path, subtree_index);
3915         }
3916
3917         ret = 0;
3918 out:
3919         return ret;
3920 }
3921
3922 static int ocfs2_do_insert_extent(struct inode *inode,
3923                                   handle_t *handle,
3924                                   struct ocfs2_extent_tree *et,
3925                                   struct ocfs2_extent_rec *insert_rec,
3926                                   struct ocfs2_insert_type *type)
3927 {
3928         int ret, rotate = 0;
3929         u32 cpos;
3930         struct ocfs2_path *right_path = NULL;
3931         struct ocfs2_path *left_path = NULL;
3932         struct ocfs2_extent_list *el;
3933
3934         el = et->et_root_el;
3935
3936         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3937                                    OCFS2_JOURNAL_ACCESS_WRITE);
3938         if (ret) {
3939                 mlog_errno(ret);
3940                 goto out;
3941         }
3942
3943         if (le16_to_cpu(el->l_tree_depth) == 0) {
3944                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3945                 goto out_update_clusters;
3946         }
3947
3948         right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
3949         if (!right_path) {
3950                 ret = -ENOMEM;
3951                 mlog_errno(ret);
3952                 goto out;
3953         }
3954
3955         /*
3956          * Determine the path to start with. Rotations need the
3957          * rightmost path, everything else can go directly to the
3958          * target leaf.
3959          */
3960         cpos = le32_to_cpu(insert_rec->e_cpos);
3961         if (type->ins_appending == APPEND_NONE &&
3962             type->ins_contig == CONTIG_NONE) {
3963                 rotate = 1;
3964                 cpos = UINT_MAX;
3965         }
3966
3967         ret = ocfs2_find_path(inode, right_path, cpos);
3968         if (ret) {
3969                 mlog_errno(ret);
3970                 goto out;
3971         }
3972
3973         /*
3974          * Rotations and appends need special treatment - they modify
3975          * parts of the tree above them.
3976          *
3977          * Both might pass back a path immediately to the left of the
3978          * one being inserted to. This will cause
3979          * ocfs2_insert_path() to modify the rightmost records of
3980          * left_path to account for an edge insert.
3981          *
3982          * XXX: When modifying this code, keep in mind that an insert
3983          * can wind up skipping both of these two special cases...
3984          */
3985         if (rotate) {
3986                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
3987                                               le32_to_cpu(insert_rec->e_cpos),
3988                                               right_path, &left_path);
3989                 if (ret) {
3990                         mlog_errno(ret);
3991                         goto out;
3992                 }
3993
3994                 /*
3995                  * ocfs2_rotate_tree_right() might have extended the
3996                  * transaction without re-journaling our tree root.
3997                  */
3998                 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3999                                            OCFS2_JOURNAL_ACCESS_WRITE);
4000                 if (ret) {
4001                         mlog_errno(ret);
4002                         goto out;
4003                 }
4004         } else if (type->ins_appending == APPEND_TAIL
4005                    && type->ins_contig != CONTIG_LEFT) {
4006                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4007                                                right_path, &left_path);
4008                 if (ret) {
4009                         mlog_errno(ret);
4010                         goto out;
4011                 }
4012         }
4013
4014         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4015                                 insert_rec, type);
4016         if (ret) {
4017                 mlog_errno(ret);
4018                 goto out;
4019         }
4020
4021 out_update_clusters:
4022         if (type->ins_split == SPLIT_NONE)
4023                 ocfs2_et_update_clusters(inode, et,
4024                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4025
4026         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4027         if (ret)
4028                 mlog_errno(ret);
4029
4030 out:
4031         ocfs2_free_path(left_path);
4032         ocfs2_free_path(right_path);
4033
4034         return ret;
4035 }
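/*
 * In short, ocfs2_do_insert_extent() picks its strategy from the
 * insert type: a depth-0 root list goes straight to
 * ocfs2_insert_at_leaf(); a non-appending, non-contiguous insert
 * rotates the tree right starting from the rightmost path; a tail
 * append updates the rightmost interior records via
 * ocfs2_append_rec_to_path(); the non-trivial cases then all funnel
 * through ocfs2_insert_path().
 */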
4036
4037 static enum ocfs2_contig_type
4038 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4039                                struct ocfs2_extent_list *el, int index,
4040                                struct ocfs2_extent_rec *split_rec)
4041 {
4042         int status;
4043         enum ocfs2_contig_type ret = CONTIG_NONE;
4044         u32 left_cpos, right_cpos;
4045         struct ocfs2_extent_rec *rec = NULL;
4046         struct ocfs2_extent_list *new_el;
4047         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4048         struct buffer_head *bh;
4049         struct ocfs2_extent_block *eb;
4050
4051         if (index > 0) {
4052                 rec = &el->l_recs[index - 1];
4053         } else if (path->p_tree_depth > 0) {
4054                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4055                                                        path, &left_cpos);
4056                 if (status)
4057                         goto out;
4058
4059                 if (left_cpos != 0) {
4060                         left_path = ocfs2_new_path(path_root_bh(path),
4061                                                    path_root_el(path));
4062                         if (!left_path)
4063                                 goto out;
4064
4065                         status = ocfs2_find_path(inode, left_path, left_cpos);
4066                         if (status)
4067                                 goto out;
4068
4069                         new_el = path_leaf_el(left_path);
4070
4071                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4072                             le16_to_cpu(new_el->l_count)) {
4073                                 bh = path_leaf_bh(left_path);
4074                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4075                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4076                                                                  eb);
4077                                 goto out;
4078                         }
4079                         rec = &new_el->l_recs[
4080                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4081                 }
4082         }
4083
4084         /*
4085          * We're careful to check for an empty extent record here -
4086          * the merge code will know what to do if it sees one.
4087          */
4088         if (rec) {
4089                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4090                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4091                                 ret = CONTIG_RIGHT;
4092                 } else {
4093                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4094                 }
4095         }
4096
4097         rec = NULL;
4098         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4099                 rec = &el->l_recs[index + 1];
4100         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4101                  path->p_tree_depth > 0) {
4102                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4103                                                         path, &right_cpos);
4104                 if (status)
4105                         goto out;
4106
4107                 if (right_cpos == 0)
4108                         goto out;
4109
4110                 right_path = ocfs2_new_path(path_root_bh(path),
4111                                             path_root_el(path));
4112                 if (!right_path)
4113                         goto out;
4114
4115                 status = ocfs2_find_path(inode, right_path, right_cpos);
4116                 if (status)
4117                         goto out;
4118
4119                 new_el = path_leaf_el(right_path);
4120                 rec = &new_el->l_recs[0];
4121                 if (ocfs2_is_empty_extent(rec)) {
4122                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4123                                 bh = path_leaf_bh(right_path);
4124                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4125                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4126                                                                  eb);
4127                                 goto out;
4128                         }
4129                         rec = &new_el->l_recs[1];
4130                 }
4131         }
4132
4133         if (rec) {
4134                 enum ocfs2_contig_type contig_type;
4135
4136                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4137
4138                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4139                         ret = CONTIG_LEFTRIGHT;
4140                 else if (ret == CONTIG_NONE)
4141                         ret = contig_type;
4142         }
4143
4144 out:
4145         if (left_path)
4146                 ocfs2_free_path(left_path);
4147         if (right_path)
4148                 ocfs2_free_path(right_path);
4149
4150         return ret;
4151 }
4152
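/*
 * Figure out whether insert_rec is contiguous with a record already in
 * the leaf list el. On a match, the index and contiguity type are
 * recorded in the insert descriptor. If combining the two records
 * would exceed the tree's per-leaf cluster limit
 * (et->et_max_leaf_clusters), the insert is treated as non-contiguous.
 */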
4153 static void ocfs2_figure_contig_type(struct inode *inode,
4154                                      struct ocfs2_insert_type *insert,
4155                                      struct ocfs2_extent_list *el,
4156                                      struct ocfs2_extent_rec *insert_rec,
4157                                      struct ocfs2_extent_tree *et)
4158 {
4159         int i;
4160         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4161
4162         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4163
4164         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4165                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4166                                                   insert_rec);
4167                 if (contig_type != CONTIG_NONE) {
4168                         insert->ins_contig_index = i;
4169                         break;
4170                 }
4171         }
4172         insert->ins_contig = contig_type;
4173
4174         if (insert->ins_contig != CONTIG_NONE) {
4175                 struct ocfs2_extent_rec *rec =
4176                                 &el->l_recs[insert->ins_contig_index];
4177                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4178                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4179
4180                 /*
4181                  * Caller might want us to limit the size of extents, don't
4182                  * calculate contiguousness if we might exceed that limit.
4183                  */
4184                 if (et->et_max_leaf_clusters &&
4185                     (len > et->et_max_leaf_clusters))
4186                         insert->ins_contig = CONTIG_NONE;
4187         }
4188 }
4189
4190 /*
4191  * This should only be called against the rightmost leaf extent list.
4192  *
4193  * ocfs2_figure_appending_type() will figure out whether we'll have to
4194  * insert at the tail of the rightmost leaf.
4195  *
4196  * This should also work against the root extent list for trees with 0
4197  * depth. If we consider the root extent list to be the rightmost leaf node
4198  * then the logic here makes sense.
4199  */
4200 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4201                                         struct ocfs2_extent_list *el,
4202                                         struct ocfs2_extent_rec *insert_rec)
4203 {
4204         int i;
4205         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4206         struct ocfs2_extent_rec *rec;
4207
4208         insert->ins_appending = APPEND_NONE;
4209
4210         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4211
4212         if (!el->l_next_free_rec)
4213                 goto set_tail_append;
4214
4215         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4216                 /* Were all records empty? */
4217                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4218                         goto set_tail_append;
4219         }
4220
4221         i = le16_to_cpu(el->l_next_free_rec) - 1;
4222         rec = &el->l_recs[i];
4223
4224         if (cpos >=
4225             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4226                 goto set_tail_append;
4227
4228         return;
4229
4230 set_tail_append:
4231         insert->ins_appending = APPEND_TAIL;
4232 }
4233
4234 /*
4235  * Helper function called at the beginning of an insert.
4236  *
4237  * This computes a few things that are commonly used in the process of
4238  * inserting into the btree:
4239  *   - Whether the new extent is contiguous with an existing one.
4240  *   - The current tree depth.
4241  *   - Whether the insert is an appending one.
4242  *   - The total # of free records in the tree.
4243  *
4244  * All of the information is stored on the ocfs2_insert_type
4245  * structure.
4246  */
4247 static int ocfs2_figure_insert_type(struct inode *inode,
4248                                     struct ocfs2_extent_tree *et,
4249                                     struct buffer_head **last_eb_bh,
4250                                     struct ocfs2_extent_rec *insert_rec,
4251                                     int *free_records,
4252                                     struct ocfs2_insert_type *insert)
4253 {
4254         int ret;
4255         struct ocfs2_extent_block *eb;
4256         struct ocfs2_extent_list *el;
4257         struct ocfs2_path *path = NULL;
4258         struct buffer_head *bh = NULL;
4259
4260         insert->ins_split = SPLIT_NONE;
4261
4262         el = et->et_root_el;
4263         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4264
4265         if (el->l_tree_depth) {
4266                 /*
4267                  * If we have tree depth, we read in the
4268                  * rightmost extent block ahead of time as
4269                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4270                  * may want it later.
4271                  */
4272                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4273                                        ocfs2_et_get_last_eb_blk(et), &bh,
4274                                        OCFS2_BH_CACHED, inode);
4275                 if (ret) {
4276                         mlog_exit(ret);
4277                         goto out;
4278                 }
4279                 eb = (struct ocfs2_extent_block *) bh->b_data;
4280                 el = &eb->h_list;
4281         }
4282
4283         /*
4284          * Unless we have a contiguous insert, we'll need to know if
4285          * there is room left in our allocation tree for another
4286          * extent record.
4287          *
4288          * XXX: This test is simplistic; we can search for empty
4289          * extent records too.
4290          */
4291         *free_records = le16_to_cpu(el->l_count) -
4292                 le16_to_cpu(el->l_next_free_rec);
4293
4294         if (!insert->ins_tree_depth) {
4295                 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4296                 ocfs2_figure_appending_type(insert, el, insert_rec);
4297                 return 0;
4298         }
4299
4300         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4301         if (!path) {
4302                 ret = -ENOMEM;
4303                 mlog_errno(ret);
4304                 goto out;
4305         }
4306
4307         /*
4308          * In the case that we're inserting past what the tree
4309          * currently accounts for, ocfs2_find_path() will return for
4310          * us the rightmost tree path. This is accounted for below in
4311          * the appending code.
4312          */
4313         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4314         if (ret) {
4315                 mlog_errno(ret);
4316                 goto out;
4317         }
4318
4319         el = path_leaf_el(path);
4320
4321         /*
4322          * Now that we have the path, there are two things we want to determine:
4323          * 1) Contiguousness (also set contig_index if this is so)
4324          *
4325          * 2) Are we doing an append? We can trivially break this up
4326          *     into two types of appends: simple record append, or a
4327          *     rotate inside the tail leaf.
4328          */
4329         ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4330
4331         /*
4332          * The insert code isn't quite ready to deal with all cases of
4333          * left contiguousness. Specifically, if it's an insert into
4334          * the 1st record in a leaf, it will require the adjustment of
4335          * cluster count on the last record of the path directly to its
4336          * left. For now, just catch that case and fool the layers
4337          * above us. This works just fine for tree_depth == 0, which
4338          * is why we allow that above.
4339          */
4340         if (insert->ins_contig == CONTIG_LEFT &&
4341             insert->ins_contig_index == 0)
4342                 insert->ins_contig = CONTIG_NONE;
4343
4344         /*
4345          * We can simply compare against last_eb to figure out whether
4346          * ocfs2_find_path() landed on the rightmost leaf (which is also
4347          * where an insert past the end of the tree ends up). If so, this
4348          * may be a tail append, and maybe we can take advantage of that.
4349          */
4350         if (ocfs2_et_get_last_eb_blk(et) ==
4351             path_leaf_bh(path)->b_blocknr) {
4352                 /*
4353                  * Ok, ocfs2_find_path() returned us the rightmost
4354                  * tree path. This might be an appending insert. There are
4355                  * two cases:
4356                  *    1) We're doing a true append at the tail:
4357                  *      -This might even be off the end of the leaf
4358                  *    2) We're "appending" by rotating in the tail
4359                  */
4360                 ocfs2_figure_appending_type(insert, el, insert_rec);
4361         }
4362
4363 out:
4364         ocfs2_free_path(path);
4365
4366         if (ret == 0)
4367                 *last_eb_bh = bh;
4368         else
4369                 brelse(bh);
4370         return ret;
4371 }
4372
4373 /*
4374  * Insert an extent into an inode btree.
4375  *
4376  * The caller needs to update fe->i_clusters
4377  */
4378 int ocfs2_insert_extent(struct ocfs2_super *osb,
4379                         handle_t *handle,
4380                         struct inode *inode,
4381                         struct ocfs2_extent_tree *et,
4382                         u32 cpos,
4383                         u64 start_blk,
4384                         u32 new_clusters,
4385                         u8 flags,
4386                         struct ocfs2_alloc_context *meta_ac)
4387 {
4388         int status;
4389         int uninitialized_var(free_records);
4390         struct buffer_head *last_eb_bh = NULL;
4391         struct ocfs2_insert_type insert = {0, };
4392         struct ocfs2_extent_rec rec;
4393
4394         mlog(0, "add %u clusters at position %u to inode %llu\n",
4395              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4396
4397         memset(&rec, 0, sizeof(rec));
4398         rec.e_cpos = cpu_to_le32(cpos);
4399         rec.e_blkno = cpu_to_le64(start_blk);
4400         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4401         rec.e_flags = flags;
4402         status = ocfs2_et_insert_check(inode, et, &rec);
4403         if (status) {
4404                 mlog_errno(status);
4405                 goto bail;
4406         }
4407
4408         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4409                                           &free_records, &insert);
4410         if (status < 0) {
4411                 mlog_errno(status);
4412                 goto bail;
4413         }
4414
4415         mlog(0, "Insert.appending: %u, Insert.contig: %u, "
4416              "Insert.contig_index: %d, Insert.free_records: %d, "
4417              "Insert.tree_depth: %d\n",
4418              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4419              free_records, insert.ins_tree_depth);
4420
4421         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4422                 status = ocfs2_grow_tree(inode, handle, et,
4423                                          &insert.ins_tree_depth, &last_eb_bh,
4424                                          meta_ac);
4425                 if (status) {
4426                         mlog_errno(status);
4427                         goto bail;
4428                 }
4429         }
4430
4431         /* Finally, we can add clusters. This might rotate the tree for us. */
4432         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4433         if (status < 0)
4434                 mlog_errno(status);
4435         else if (et->et_ops == &ocfs2_dinode_et_ops)
4436                 ocfs2_extent_map_insert_rec(inode, &rec);
4437
4438 bail:
4439         if (last_eb_bh)
4440                 brelse(last_eb_bh);
4441
4442         mlog_exit(status);
4443         return status;
4444 }
4445
4446 /*
4447  * Allocate and add clusters to the extent b-tree.
4448  * The new clusters (clusters_to_add) will be inserted at logical_offset.
4449  * The extent b-tree's root is specified by et, and it is not limited to
4450  * file data storage - any extent tree that implements a proper
4451  * ocfs2_extent_tree can use this function.
4452  */
4453 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4454                                 struct inode *inode,
4455                                 u32 *logical_offset,
4456                                 u32 clusters_to_add,
4457                                 int mark_unwritten,
4458                                 struct ocfs2_extent_tree *et,
4459                                 handle_t *handle,
4460                                 struct ocfs2_alloc_context *data_ac,
4461                                 struct ocfs2_alloc_context *meta_ac,
4462                                 enum ocfs2_alloc_restarted *reason_ret)
4463 {
4464         int status = 0;
4465         int free_extents;
4466         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4467         u32 bit_off, num_bits;
4468         u64 block;
4469         u8 flags = 0;
4470
4471         BUG_ON(!clusters_to_add);
4472
4473         if (mark_unwritten)
4474                 flags = OCFS2_EXT_UNWRITTEN;
4475
4476         free_extents = ocfs2_num_free_extents(osb, inode, et);
4477         if (free_extents < 0) {
4478                 status = free_extents;
4479                 mlog_errno(status);
4480                 goto leave;
4481         }
4482
4483         /* there are two cases which could cause us to EAGAIN in the
4484          * we-need-more-metadata case:
4485          * 1) we haven't reserved *any*
4486          * 2) we are so fragmented, we've needed to add metadata too
4487          *    many times. */
4488         if (!free_extents && !meta_ac) {
4489                 mlog(0, "we haven't reserved any metadata!\n");
4490                 status = -EAGAIN;
4491                 reason = RESTART_META;
4492                 goto leave;
4493         } else if ((!free_extents)
4494                    && (ocfs2_alloc_context_bits_left(meta_ac)
4495                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4496                 mlog(0, "filesystem is really fragmented...\n");
4497                 status = -EAGAIN;
4498                 reason = RESTART_META;
4499                 goto leave;
4500         }
4501
4502         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4503                                         clusters_to_add, &bit_off, &num_bits);
4504         if (status < 0) {
4505                 if (status != -ENOSPC)
4506                         mlog_errno(status);
4507                 goto leave;
4508         }
4509
4510         BUG_ON(num_bits > clusters_to_add);
4511
4512         /* reserve our write early -- insert_extent may update the inode */
4513         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4514                                       OCFS2_JOURNAL_ACCESS_WRITE);
4515         if (status < 0) {
4516                 mlog_errno(status);
4517                 goto leave;
4518         }
4519
4520         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4521         mlog(0, "Allocating %u clusters starting at cluster %u for inode %llu\n",
4522              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4523         status = ocfs2_insert_extent(osb, handle, inode, et,
4524                                      *logical_offset, block,
4525                                      num_bits, flags, meta_ac);
4526         if (status < 0) {
4527                 mlog_errno(status);
4528                 goto leave;
4529         }
4530
4531         status = ocfs2_journal_dirty(handle, et->et_root_bh);
4532         if (status < 0) {
4533                 mlog_errno(status);
4534                 goto leave;
4535         }
4536
4537         clusters_to_add -= num_bits;
4538         *logical_offset += num_bits;
4539
4540         if (clusters_to_add) {
4541                 mlog(0, "need to alloc once more, wanted = %u\n",
4542                      clusters_to_add);
4543                 status = -EAGAIN;
4544                 reason = RESTART_TRANS;
4545         }
4546
4547 leave:
4548         mlog_exit(status);
4549         if (reason_ret)
4550                 *reason_ret = reason;
4551         return status;
4552 }
4553
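/*
 * Build the right-hand half of a record split: split_rec covers rec
 * from cpos through the end of rec, with e_blkno advanced to match and
 * e_flags carried over. For example, splitting a record of (cpos 100,
 * 8 clusters) at cpos 103 yields (cpos 103, 5 clusters) with e_blkno
 * moved forward by three clusters worth of blocks.
 */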
4554 static void ocfs2_make_right_split_rec(struct super_block *sb,
4555                                        struct ocfs2_extent_rec *split_rec,
4556                                        u32 cpos,
4557                                        struct ocfs2_extent_rec *rec)
4558 {
4559         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4560         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4561
4562         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4563
4564         split_rec->e_cpos = cpu_to_le32(cpos);
4565         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4566
4567         split_rec->e_blkno = rec->e_blkno;
4568         le64_add_cpu(&split_rec->e_blkno,
4569                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4570
4571         split_rec->e_flags = rec->e_flags;
4572 }
4573
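/*
 * Split the record at split_index so that orig_split_rec becomes an
 * extent record of its own. When the split range falls in the middle
 * of the existing record this takes two passes through "leftright:" -
 * the tail of the record is carved off first, then the original split
 * range is carved out of what remains. The tree is grown beforehand if
 * the rightmost leaf has no free records.
 */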
4574 static int ocfs2_split_and_insert(struct inode *inode,
4575                                   handle_t *handle,
4576                                   struct ocfs2_path *path,
4577                                   struct ocfs2_extent_tree *et,
4578                                   struct buffer_head **last_eb_bh,
4579                                   int split_index,
4580                                   struct ocfs2_extent_rec *orig_split_rec,
4581                                   struct ocfs2_alloc_context *meta_ac)
4582 {
4583         int ret = 0, depth;
4584         unsigned int insert_range, rec_range, do_leftright = 0;
4585         struct ocfs2_extent_rec tmprec;
4586         struct ocfs2_extent_list *rightmost_el;
4587         struct ocfs2_extent_rec rec;
4588         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4589         struct ocfs2_insert_type insert;
4590         struct ocfs2_extent_block *eb;
4591
4592 leftright:
4593         /*
4594          * Store a copy of the record on the stack - it might move
4595          * around as the tree is manipulated below.
4596          */
4597         rec = path_leaf_el(path)->l_recs[split_index];
4598
4599         rightmost_el = et->et_root_el;
4600
4601         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4602         if (depth) {
4603                 BUG_ON(!(*last_eb_bh));
4604                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4605                 rightmost_el = &eb->h_list;
4606         }
4607
4608         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4609             le16_to_cpu(rightmost_el->l_count)) {
4610                 ret = ocfs2_grow_tree(inode, handle, et,
4611                                       &depth, last_eb_bh, meta_ac);
4612                 if (ret) {
4613                         mlog_errno(ret);
4614                         goto out;
4615                 }
4616         }
4617
4618         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4619         insert.ins_appending = APPEND_NONE;
4620         insert.ins_contig = CONTIG_NONE;
4621         insert.ins_tree_depth = depth;
4622
4623         insert_range = le32_to_cpu(split_rec.e_cpos) +
4624                 le16_to_cpu(split_rec.e_leaf_clusters);
4625         rec_range = le32_to_cpu(rec.e_cpos) +
4626                 le16_to_cpu(rec.e_leaf_clusters);
4627
4628         if (split_rec.e_cpos == rec.e_cpos) {
4629                 insert.ins_split = SPLIT_LEFT;
4630         } else if (insert_range == rec_range) {
4631                 insert.ins_split = SPLIT_RIGHT;
4632         } else {
4633                 /*
4634                  * Left/right split. We fake this as a right split
4635                  * first and then make a second pass as a left split.
4636                  */
4637                 insert.ins_split = SPLIT_RIGHT;
4638
4639                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4640                                            &rec);
4641
4642                 split_rec = tmprec;
4643
4644                 BUG_ON(do_leftright);
4645                 do_leftright = 1;
4646         }
4647
4648         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4649         if (ret) {
4650                 mlog_errno(ret);
4651                 goto out;
4652         }
4653
4654         if (do_leftright == 1) {
4655                 u32 cpos;
4656                 struct ocfs2_extent_list *el;
4657
4658                 do_leftright++;
4659                 split_rec = *orig_split_rec;
4660
4661                 ocfs2_reinit_path(path, 1);
4662
4663                 cpos = le32_to_cpu(split_rec.e_cpos);
4664                 ret = ocfs2_find_path(inode, path, cpos);
4665                 if (ret) {
4666                         mlog_errno(ret);
4667                         goto out;
4668                 }
4669
4670                 el = path_leaf_el(path);
4671                 split_index = ocfs2_search_extent_list(el, cpos);
4672                 goto leftright;
4673         }
4674 out:
4675
4676         return ret;
4677 }
4678
4679 /*
4680  * Mark part or all of the extent record at split_index in the leaf
4681  * pointed to by path as written. This removes the unwritten
4682  * extent flag.
4683  *
4684  * Care is taken to handle contiguousness so as to not grow the tree.
4685  *
4686  * meta_ac is not strictly necessary - we only truly need it if growth
4687  * of the tree is required. All other cases will degrade into a less
4688  * optimal tree layout.
4689  *
4690  * last_eb_bh should be the rightmost leaf block for any extent
4691  * btree. Since a split may grow the tree or a merge might shrink it,
4692  * the caller cannot trust the contents of that buffer after this call.
4693  *
4694  * This code is optimized for readability - several passes might be
4695  * made over certain portions of the tree. All of those blocks will
4696  * have been brought into cache (and pinned via the journal), so the
4697  * extra overhead is not expressed in terms of disk reads.
4698  */
4699 static int __ocfs2_mark_extent_written(struct inode *inode,
4700                                        struct ocfs2_extent_tree *et,
4701                                        handle_t *handle,
4702                                        struct ocfs2_path *path,
4703                                        int split_index,
4704                                        struct ocfs2_extent_rec *split_rec,
4705                                        struct ocfs2_alloc_context *meta_ac,
4706                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4707 {
4708         int ret = 0;
4709         struct ocfs2_extent_list *el = path_leaf_el(path);
4710         struct buffer_head *last_eb_bh = NULL;
4711         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4712         struct ocfs2_merge_ctxt ctxt;
4713         struct ocfs2_extent_list *rightmost_el;
4714
4715         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4716                 ret = -EIO;
4717                 mlog_errno(ret);
4718                 goto out;
4719         }
4720
4721         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4722             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4723              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4724                 ret = -EIO;
4725                 mlog_errno(ret);
4726                 goto out;
4727         }
4728
4729         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4730                                                             split_index,
4731                                                             split_rec);
4732
4733         /*
4734          * The core merge / split code wants to know how much room is
4735          * left in this inode's allocation tree, so we pass the
4736          * rightmost extent list.
4737          */
4738         if (path->p_tree_depth) {
4739                 struct ocfs2_extent_block *eb;
4740
4741                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4742                                        ocfs2_et_get_last_eb_blk(et),
4743                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4744                 if (ret) {
4745                         mlog_exit(ret);
4746                         goto out;
4747                 }
4748
4749                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4750                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4751                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4752                         ret = -EROFS;
4753                         goto out;
4754                 }
4755
4756                 rightmost_el = &eb->h_list;
4757         } else
4758                 rightmost_el = path_root_el(path);
4759
4760         if (rec->e_cpos == split_rec->e_cpos &&
4761             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4762                 ctxt.c_split_covers_rec = 1;
4763         else
4764                 ctxt.c_split_covers_rec = 0;
4765
4766         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4767
4768         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4769              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4770              ctxt.c_split_covers_rec);
4771
4772         if (ctxt.c_contig_type == CONTIG_NONE) {
4773                 if (ctxt.c_split_covers_rec)
4774                         el->l_recs[split_index] = *split_rec;
4775                 else
4776                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4777                                                      &last_eb_bh, split_index,
4778                                                      split_rec, meta_ac);
4779                 if (ret)
4780                         mlog_errno(ret);
4781         } else {
4782                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4783                                                 split_index, split_rec,
4784                                                 dealloc, &ctxt, et);
4785                 if (ret)
4786                         mlog_errno(ret);
4787         }
4788
4789 out:
4790         brelse(last_eb_bh);
4791         return ret;
4792 }
4793
4794 /*
4795  * Mark the already-existing extent at cpos as written for len clusters.
4796  *
4797  * If the existing extent is larger than the request, initiate a
4798  * split. An attempt will be made at merging with adjacent extents.
4799  *
4800  * The caller is responsible for passing down meta_ac if we'll need it.
4801  */
4802 int ocfs2_mark_extent_written(struct inode *inode,
4803                               struct ocfs2_extent_tree *et,
4804                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4805                               struct ocfs2_alloc_context *meta_ac,
4806                               struct ocfs2_cached_dealloc_ctxt *dealloc)
4807 {
4808         int ret, index;
4809         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4810         struct ocfs2_extent_rec split_rec;
4811         struct ocfs2_path *left_path = NULL;
4812         struct ocfs2_extent_list *el;
4813
4814         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4815              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4816
4817         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4818                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4819                             "that are being written to, but the feature bit "
4820                             "is not set in the super block.",
4821                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4822                 ret = -EROFS;
4823                 goto out;
4824         }
4825
4826         /*
4827          * XXX: This should be fixed up so that we just re-insert the
4828          * next extent records.
4829          *
4830          * XXX: This is a hack on the extent tree, maybe it should be
4831          * an op?
4832          */
4833         if (et->et_ops == &ocfs2_dinode_et_ops)
4834                 ocfs2_extent_map_trunc(inode, 0);
4835
4836         left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4837         if (!left_path) {
4838                 ret = -ENOMEM;
4839                 mlog_errno(ret);
4840                 goto out;
4841         }
4842
4843         ret = ocfs2_find_path(inode, left_path, cpos);
4844         if (ret) {
4845                 mlog_errno(ret);
4846                 goto out;
4847         }
4848         el = path_leaf_el(left_path);
4849
4850         index = ocfs2_search_extent_list(el, cpos);
4851         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4852                 ocfs2_error(inode->i_sb,
4853                             "Inode %llu has an extent at cpos %u which can no "
4854                             "longer be found.\n",
4855                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4856                 ret = -EROFS;
4857                 goto out;
4858         }
4859
4860         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4861         split_rec.e_cpos = cpu_to_le32(cpos);
4862         split_rec.e_leaf_clusters = cpu_to_le16(len);
4863         split_rec.e_blkno = cpu_to_le64(start_blkno);
4864         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4865         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4866
4867         ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4868                                           index, &split_rec, meta_ac,
4869                                           dealloc);
4870         if (ret)
4871                 mlog_errno(ret);
4872
4873 out:
4874         ocfs2_free_path(left_path);
4875         return ret;
4876 }
4877
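/*
 * Split the record at index in path's leaf at cluster offset
 * new_range, so that everything from new_range onward becomes a record
 * of its own. The transaction is extended (and the tree grown if the
 * rightmost list is full) before the SPLIT_RIGHT insert is done.
 */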
4878 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4879                             handle_t *handle, struct ocfs2_path *path,
4880                             int index, u32 new_range,
4881                             struct ocfs2_alloc_context *meta_ac)
4882 {
4883         int ret, depth, credits = handle->h_buffer_credits;
4884         struct buffer_head *last_eb_bh = NULL;
4885         struct ocfs2_extent_block *eb;
4886         struct ocfs2_extent_list *rightmost_el, *el;
4887         struct ocfs2_extent_rec split_rec;
4888         struct ocfs2_extent_rec *rec;
4889         struct ocfs2_insert_type insert;
4890
4891         /*
4892          * Setup the record to split before we grow the tree.
4893          */
4894         el = path_leaf_el(path);
4895         rec = &el->l_recs[index];
4896         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4897
4898         depth = path->p_tree_depth;
4899         if (depth > 0) {
4900                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4901                                        ocfs2_et_get_last_eb_blk(et),
4902                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4903                 if (ret < 0) {
4904                         mlog_errno(ret);
4905                         goto out;
4906                 }
4907
4908                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4909                 rightmost_el = &eb->h_list;
4910         } else
4911                 rightmost_el = path_leaf_el(path);
4912
4913         credits += path->p_tree_depth +
4914                    ocfs2_extend_meta_needed(et->et_root_el);
4915         ret = ocfs2_extend_trans(handle, credits);
4916         if (ret) {
4917                 mlog_errno(ret);
4918                 goto out;
4919         }
4920
4921         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4922             le16_to_cpu(rightmost_el->l_count)) {
4923                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4924                                       meta_ac);
4925                 if (ret) {
4926                         mlog_errno(ret);
4927                         goto out;
4928                 }
4929         }
4930
4931         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4932         insert.ins_appending = APPEND_NONE;
4933         insert.ins_contig = CONTIG_NONE;
4934         insert.ins_split = SPLIT_RIGHT;
4935         insert.ins_tree_depth = depth;
4936
4937         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4938         if (ret)
4939                 mlog_errno(ret);
4940
4941 out:
4942         brelse(last_eb_bh);
4943         return ret;
4944 }
4945
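/*
 * Remove len clusters starting at cpos from the record at index in
 * path's leaf. The range must share at least one edge with the record:
 * whole-record removal empties the record so the left rotation below
 * can fill the hole, while edge removal just shrinks it (updating the
 * record lengths above us if this is the rightmost tree record). A
 * range strictly inside the record is a caller bug - it should have
 * been split out with ocfs2_split_tree() first.
 */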
4946 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4947                               struct ocfs2_path *path, int index,
4948                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4949                               u32 cpos, u32 len,
4950                               struct ocfs2_extent_tree *et)
4951 {
4952         int ret;
4953         u32 left_cpos, rec_range, trunc_range;
4954         int wants_rotate = 0, is_rightmost_tree_rec = 0;
4955         struct super_block *sb = inode->i_sb;
4956         struct ocfs2_path *left_path = NULL;
4957         struct ocfs2_extent_list *el = path_leaf_el(path);
4958         struct ocfs2_extent_rec *rec;
4959         struct ocfs2_extent_block *eb;
4960
4961         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4962                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4963                 if (ret) {
4964                         mlog_errno(ret);
4965                         goto out;
4966                 }
4967
4968                 index--;
4969         }
4970
4971         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4972             path->p_tree_depth) {
4973                 /*
4974                  * Check whether this is the rightmost tree record. If
4975                  * we remove all of this record or part of its right
4976                  * edge then an update of the record lengths above it
4977                  * will be required.
4978                  */
4979                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4980                 if (eb->h_next_leaf_blk == 0)
4981                         is_rightmost_tree_rec = 1;
4982         }
4983
4984         rec = &el->l_recs[index];
4985         if (index == 0 && path->p_tree_depth &&
4986             le32_to_cpu(rec->e_cpos) == cpos) {
4987                 /*
4988                  * Changing the leftmost offset (via partial or whole
4989                  * record truncate) of an interior (or rightmost) path
4990                  * means we have to update the subtree that is formed
4991                  * by this leaf and the one to it's left.
4992                  * by this leaf and the one to its left.
4993                  * There are two cases we can skip:
4994                  *   1) Path is the leftmost one in our inode tree.
4995                  *   2) The leaf is rightmost and will be empty after
4996                  *      we remove the extent record - the rotate code
4997                  *      knows how to update the newly formed edge.
4998                  */
4999
5000                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5001                                                     &left_cpos);
5002                 if (ret) {
5003                         mlog_errno(ret);
5004                         goto out;
5005                 }
5006
5007                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5008                         left_path = ocfs2_new_path(path_root_bh(path),
5009                                                    path_root_el(path));
5010                         if (!left_path) {
5011                                 ret = -ENOMEM;
5012                                 mlog_errno(ret);
5013                                 goto out;
5014                         }
5015
5016                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5017                         if (ret) {
5018                                 mlog_errno(ret);
5019                                 goto out;
5020                         }
5021                 }
5022         }
5023
5024         ret = ocfs2_extend_rotate_transaction(handle, 0,
5025                                               handle->h_buffer_credits,
5026                                               path);
5027         if (ret) {
5028                 mlog_errno(ret);
5029                 goto out;
5030         }
5031
5032         ret = ocfs2_journal_access_path(inode, handle, path);
5033         if (ret) {
5034                 mlog_errno(ret);
5035                 goto out;
5036         }
5037
5038         ret = ocfs2_journal_access_path(inode, handle, left_path);
5039         if (ret) {
5040                 mlog_errno(ret);
5041                 goto out;
5042         }
5043
5044         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5045         trunc_range = cpos + len;
5046
5047         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5048                 int next_free;
5049
5050                 memset(rec, 0, sizeof(*rec));
5051                 ocfs2_cleanup_merge(el, index);
5052                 wants_rotate = 1;
5053
5054                 next_free = le16_to_cpu(el->l_next_free_rec);
5055                 if (is_rightmost_tree_rec && next_free > 1) {
5056                         /*
5057                          * We skip the edge update if this path will
5058                          * be deleted by the rotate code.
5059                          */
5060                         rec = &el->l_recs[next_free - 1];
5061                         ocfs2_adjust_rightmost_records(inode, handle, path,
5062                                                        rec);
5063                 }
5064         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5065                 /* Remove leftmost portion of the record. */
5066                 le32_add_cpu(&rec->e_cpos, len);
5067                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5068                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5069         } else if (rec_range == trunc_range) {
5070                 /* Remove rightmost portion of the record */
5071                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5072                 if (is_rightmost_tree_rec)
5073                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5074         } else {
5075                 /* Caller should have trapped this. */
5076                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5077                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5078                      le32_to_cpu(rec->e_cpos),
5079                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5080                 BUG();
5081         }
5082
5083         if (left_path) {
5084                 int subtree_index;
5085
5086                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5087                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5088                                            subtree_index);
5089         }
5090
5091         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5092
5093         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5094         if (ret) {
5095                 mlog_errno(ret);
5096                 goto out;
5097         }
5098
5099 out:
5100         ocfs2_free_path(left_path);
5101         return ret;
5102 }
5103
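/*
 * Remove the clusters [cpos, cpos + len) from the extent tree rooted
 * at et. A range that lies wholly inside a record (sharing neither
 * edge) is first made into a record of its own with a right split and
 * then removed like an edge truncation.
 */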
5104 int ocfs2_remove_extent(struct inode *inode,
5105                         struct ocfs2_extent_tree *et,
5106                         u32 cpos, u32 len, handle_t *handle,
5107                         struct ocfs2_alloc_context *meta_ac,
5108                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5109 {
5110         int ret, index;
5111         u32 rec_range, trunc_range;
5112         struct ocfs2_extent_rec *rec;
5113         struct ocfs2_extent_list *el;
5114         struct ocfs2_path *path = NULL;
5115
5116         ocfs2_extent_map_trunc(inode, 0);
5117
5118         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
5119         if (!path) {
5120                 ret = -ENOMEM;
5121                 mlog_errno(ret);
5122                 goto out;
5123         }
5124
5125         ret = ocfs2_find_path(inode, path, cpos);
5126         if (ret) {
5127                 mlog_errno(ret);
5128                 goto out;
5129         }
5130
5131         el = path_leaf_el(path);
5132         index = ocfs2_search_extent_list(el, cpos);
5133         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5134                 ocfs2_error(inode->i_sb,
5135                             "Inode %llu has an extent at cpos %u which can no "
5136                             "longer be found.\n",
5137                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5138                 ret = -EROFS;
5139                 goto out;
5140         }
5141
5142         /*
5143          * We have 3 cases of extent removal:
5144          *   1) Range covers the entire extent rec
5145          *   2) Range begins or ends on one edge of the extent rec
5146          *   3) Range is in the middle of the extent rec (no shared edges)
5147          *
5148          * For case 1 we remove the extent rec and left rotate to
5149          * fill the hole.
5150          *
5151          * For case 2 we just shrink the existing extent rec, with a
5152          * tree update if the shrinking edge is also the edge of an
5153          * extent block.
5154          *
5155          * For case 3 we do a right split to turn the extent rec into
5156          * something case 2 can handle.
5157          */
5158         rec = &el->l_recs[index];
5159         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5160         trunc_range = cpos + len;
5161
5162         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5163
5164         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5165              "(cpos %u, len %u)\n",
5166              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5167              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5168
5169         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5170                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5171                                          cpos, len, et);
5172                 if (ret) {
5173                         mlog_errno(ret);
5174                         goto out;
5175                 }
5176         } else {
5177                 ret = ocfs2_split_tree(inode, et, handle, path, index,
5178                                        trunc_range, meta_ac);
5179                 if (ret) {
5180                         mlog_errno(ret);
5181                         goto out;
5182                 }
5183
5184                 /*
5185                  * The split could have manipulated the tree enough to
5186                  * move the record location, so we have to look for it again.
5187                  */
5188                 ocfs2_reinit_path(path, 1);
5189
5190                 ret = ocfs2_find_path(inode, path, cpos);
5191                 if (ret) {
5192                         mlog_errno(ret);
5193                         goto out;
5194                 }
5195
5196                 el = path_leaf_el(path);
5197                 index = ocfs2_search_extent_list(el, cpos);
5198                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5199                         ocfs2_error(inode->i_sb,
5200                                     "Inode %llu: split at cpos %u lost record.",
5201                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5202                                     cpos);
5203                         ret = -EROFS;
5204                         goto out;
5205                 }
5206
5207                 /*
5208                  * Double check our values here. If anything is fishy,
5209                  * it's easier to catch it at the top level.
5210                  */
5211                 rec = &el->l_recs[index];
5212                 rec_range = le32_to_cpu(rec->e_cpos) +
5213                         ocfs2_rec_clusters(el, rec);
5214                 if (rec_range != trunc_range) {
5215                         ocfs2_error(inode->i_sb,
5216                                     "Inode %llu: error after split at cpos %u "
5217                                     "trunc len %u, existing record is (%u,%u)",
5218                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5219                                     cpos, len, le32_to_cpu(rec->e_cpos),
5220                                     ocfs2_rec_clusters(el, rec));
5221                         ret = -EROFS;
5222                         goto out;
5223                 }
5224
5225                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5226                                          cpos, len, et);
5227                 if (ret) {
5228                         mlog_errno(ret);
5229                         goto out;
5230                 }
5231         }
5232
5233 out:
5234         ocfs2_free_path(path);
5235         return ret;
5236 }
5237
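/*
 * Return nonzero when the truncate log is full and must be flushed
 * before any more records can be appended to it.
 */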
5238 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5239 {
5240         struct buffer_head *tl_bh = osb->osb_tl_bh;
5241         struct ocfs2_dinode *di;
5242         struct ocfs2_truncate_log *tl;
5243
5244         di = (struct ocfs2_dinode *) tl_bh->b_data;
5245         tl = &di->id2.i_dealloc;
5246
5247         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5248                         "slot %d, invalid truncate log parameters: used = "
5249                         "%u, count = %u\n", osb->slot_num,
5250                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5251         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5252 }
5253
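/*
 * A range starting at new_start can be coalesced into the tail record
 * if it begins exactly where that record ends. For example, a tail
 * record of (start 200, 8 clusters) coalesces with new_start 208.
 */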
5254 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5255                                            unsigned int new_start)
5256 {
5257         unsigned int tail_index;
5258         unsigned int current_tail;
5259
5260         /* No records, nothing to coalesce */
5261         if (!le16_to_cpu(tl->tl_used))
5262                 return 0;
5263
5264         tail_index = le16_to_cpu(tl->tl_used) - 1;
5265         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5266         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5267
5268         return current_tail == new_start;
5269 }
5270
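/*
 * Append a range of clusters (beginning at start_blk) to this node's
 * truncate log so they can be freed later, coalescing with the tail
 * record when possible. The caller must hold tl_inode->i_mutex and
 * must already have flushed the log if it was full.
 */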
5271 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5272                               handle_t *handle,
5273                               u64 start_blk,
5274                               unsigned int num_clusters)
5275 {
5276         int status, index;
5277         unsigned int start_cluster, tl_count;
5278         struct inode *tl_inode = osb->osb_tl_inode;
5279         struct buffer_head *tl_bh = osb->osb_tl_bh;
5280         struct ocfs2_dinode *di;
5281         struct ocfs2_truncate_log *tl;
5282
5283         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5284                    (unsigned long long)start_blk, num_clusters);
5285
5286         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5287
5288         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5289
5290         di = (struct ocfs2_dinode *) tl_bh->b_data;
5291         tl = &di->id2.i_dealloc;
5292         if (!OCFS2_IS_VALID_DINODE(di)) {
5293                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5294                 status = -EIO;
5295                 goto bail;
5296         }
5297
5298         tl_count = le16_to_cpu(tl->tl_count);
5299         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5300                         tl_count == 0,
5301                         "Truncate record count on #%llu invalid: "
5302                         "wanted %u, actual %u\n",
5303                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5304                         ocfs2_truncate_recs_per_inode(osb->sb),
5305                         le16_to_cpu(tl->tl_count));
5306
5307         /* Caller should have known to flush before calling us. */
5308         index = le16_to_cpu(tl->tl_used);
5309         if (index >= tl_count) {
5310                 status = -ENOSPC;
5311                 mlog_errno(status);
5312                 goto bail;
5313         }
5314
5315         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5316                                       OCFS2_JOURNAL_ACCESS_WRITE);
5317         if (status < 0) {
5318                 mlog_errno(status);
5319                 goto bail;
5320         }
5321
5322         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5323              "%llu (index = %d)\n", num_clusters, start_cluster,
5324              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5325
5326         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5327                 /*
5328                  * Move index back to the record we are coalescing with.
5329                  * ocfs2_truncate_log_can_coalesce() guarantees tl_used is nonzero.
5330                  */
5331                 index--;
5332
5333                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5334                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5335                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5336                      num_clusters);
5337         } else {
5338                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5339                 tl->tl_used = cpu_to_le16(index + 1);
5340         }
5341         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5342
5343         status = ocfs2_journal_dirty(handle, tl_bh);
5344         if (status < 0) {
5345                 mlog_errno(status);
5346                 goto bail;
5347         }
5348
5349 bail:
5350         mlog_exit(status);
5351         return status;
5352 }
5353
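/*
 * Walk the truncate log from its tail, returning each recorded cluster
 * range to the global bitmap. tl_used is journaled and decremented
 * before each range is freed, so a crash mid-flush leaves only the
 * not-yet-freed records in the log.
 */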
5354 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5355                                          handle_t *handle,
5356                                          struct inode *data_alloc_inode,
5357                                          struct buffer_head *data_alloc_bh)
5358 {
5359         int status = 0;
5360         int i;
5361         unsigned int num_clusters;
5362         u64 start_blk;
5363         struct ocfs2_truncate_rec rec;
5364         struct ocfs2_dinode *di;
5365         struct ocfs2_truncate_log *tl;
5366         struct inode *tl_inode = osb->osb_tl_inode;
5367         struct buffer_head *tl_bh = osb->osb_tl_bh;
5368
5369         mlog_entry_void();
5370
5371         di = (struct ocfs2_dinode *) tl_bh->b_data;
5372         tl = &di->id2.i_dealloc;
5373         i = le16_to_cpu(tl->tl_used) - 1;
5374         while (i >= 0) {
5375                 /* Caller has given us at least enough credits to
5376                  * update the truncate log dinode */
5377                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5378                                               OCFS2_JOURNAL_ACCESS_WRITE);
5379                 if (status < 0) {
5380                         mlog_errno(status);
5381                         goto bail;
5382                 }
5383
5384                 tl->tl_used = cpu_to_le16(i);
5385
5386                 status = ocfs2_journal_dirty(handle, tl_bh);
5387                 if (status < 0) {
5388                         mlog_errno(status);
5389                         goto bail;
5390                 }
5391
5392                 /* TODO: Perhaps we can calculate the bulk of the
5393                  * credits up front rather than extending like
5394                  * this. */
5395                 status = ocfs2_extend_trans(handle,
5396                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5397                 if (status < 0) {
5398                         mlog_errno(status);
5399                         goto bail;
5400                 }
5401
5402                 rec = tl->tl_recs[i];
5403                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5404                                                     le32_to_cpu(rec.t_start));
5405                 num_clusters = le32_to_cpu(rec.t_clusters);
5406
5407                 /* if start_blk is not set, we ignore the record as
5408                  * invalid. */
5409                 if (start_blk) {
5410                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5411                              i, le32_to_cpu(rec.t_start), num_clusters);
5412
5413                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5414                                                      data_alloc_bh, start_blk,
5415                                                      num_clusters);
5416                         if (status < 0) {
5417                                 mlog_errno(status);
5418                                 goto bail;
5419                         }
5420                 }
5421                 i--;
5422         }
5423
5424 bail:
5425         mlog_exit(status);
5426         return status;
5427 }
5428
5429 /* Expects you to already be holding tl_inode->i_mutex */
5430 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5431 {
5432         int status;
5433         unsigned int num_to_flush;
5434         handle_t *handle;
5435         struct inode *tl_inode = osb->osb_tl_inode;
5436         struct inode *data_alloc_inode = NULL;
5437         struct buffer_head *tl_bh = osb->osb_tl_bh;
5438         struct buffer_head *data_alloc_bh = NULL;
5439         struct ocfs2_dinode *di;
5440         struct ocfs2_truncate_log *tl;
5441
5442         mlog_entry_void();
5443
5444         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5445
5446         di = (struct ocfs2_dinode *) tl_bh->b_data;
5447         tl = &di->id2.i_dealloc;
5448         if (!OCFS2_IS_VALID_DINODE(di)) {
5449                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5450                 status = -EIO;
5451                 goto out;
5452         }
5453
5454         num_to_flush = le16_to_cpu(tl->tl_used);
5455         mlog(0, "Flush %u records from truncate log #%llu\n",
5456              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5457         if (!num_to_flush) {
5458                 status = 0;
5459                 goto out;
5460         }
5461
5462         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5463                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5464                                                        OCFS2_INVALID_SLOT);
5465         if (!data_alloc_inode) {
5466                 status = -EINVAL;
5467                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5468                 goto out;
5469         }
5470
5471         mutex_lock(&data_alloc_inode->i_mutex);
5472
5473         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5474         if (status < 0) {
5475                 mlog_errno(status);
5476                 goto out_mutex;
5477         }
5478
5479         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5480         if (IS_ERR(handle)) {
5481                 status = PTR_ERR(handle);
5482                 mlog_errno(status);
5483                 goto out_unlock;
5484         }
5485
5486         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5487                                                data_alloc_bh);
5488         if (status < 0)
5489                 mlog_errno(status);
5490
5491         ocfs2_commit_trans(osb, handle);
5492
5493 out_unlock:
5494         brelse(data_alloc_bh);
5495         ocfs2_inode_unlock(data_alloc_inode, 1);
5496
5497 out_mutex:
5498         mutex_unlock(&data_alloc_inode->i_mutex);
5499         iput(data_alloc_inode);
5500
5501 out:
5502         mlog_exit(status);
5503         return status;
5504 }
5505
5506 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5507 {
5508         int status;
5509         struct inode *tl_inode = osb->osb_tl_inode;
5510
5511         mutex_lock(&tl_inode->i_mutex);
5512         status = __ocfs2_flush_truncate_log(osb);
5513         mutex_unlock(&tl_inode->i_mutex);
5514
5515         return status;
5516 }
5517
5518 static void ocfs2_truncate_log_worker(struct work_struct *work)
5519 {
5520         int status;
5521         struct ocfs2_super *osb =
5522                 container_of(work, struct ocfs2_super,
5523                              osb_truncate_log_wq.work);
5524
5525         mlog_entry_void();
5526
5527         status = ocfs2_flush_truncate_log(osb);
5528         if (status < 0)
5529                 mlog_errno(status);
5530         else
5531                 ocfs2_init_inode_steal_slot(osb);
5532
5533         mlog_exit(status);
5534 }
5535
5536 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5537 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5538                                        int cancel)
5539 {
5540         if (osb->osb_tl_inode) {
5541                 /* We want to push off log flushes while truncates are
5542                  * still running. */
5543                 if (cancel)
5544                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5545
5546                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5547                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5548         }
5549 }
5550
5551 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5552                                        int slot_num,
5553                                        struct inode **tl_inode,
5554                                        struct buffer_head **tl_bh)
5555 {
5556         int status;
5557         struct inode *inode = NULL;
5558         struct buffer_head *bh = NULL;
5559
5560         inode = ocfs2_get_system_file_inode(osb,
5561                                            TRUNCATE_LOG_SYSTEM_INODE,
5562                                            slot_num);
5563         if (!inode) {
5564                 status = -EINVAL;
5565                 mlog(ML_ERROR, "Could not get truncate log inode!\n");
5566                 goto bail;
5567         }
5568
5569         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5570                                   OCFS2_BH_CACHED, inode);
5571         if (status < 0) {
5572                 iput(inode);
5573                 mlog_errno(status);
5574                 goto bail;
5575         }
5576
5577         *tl_inode = inode;
5578         *tl_bh    = bh;
5579 bail:
5580         mlog_exit(status);
5581         return status;
5582 }
5583
5584 /* called during the 1st stage of node recovery. we stamp a clean
5585  * truncate log and pass back a copy for processing later. if the
5586  * truncate log does not require processing, *tl_copy is set to
5587  * NULL. */
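/*
 * Editor's note: a minimal sketch of how the two recovery halves in this
 * file pair up, assuming osb and slot_num are in scope and that the
 * caller frees the copy once it has been replayed:
 *
 *	struct ocfs2_dinode *tl_copy = NULL;
 *
 *	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
 *	if (!status && tl_copy) {
 *		status = ocfs2_complete_truncate_log_recovery(osb, tl_copy);
 *		kfree(tl_copy);
 *	}
 */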
5588 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5589                                       int slot_num,
5590                                       struct ocfs2_dinode **tl_copy)
5591 {
5592         int status;
5593         struct inode *tl_inode = NULL;
5594         struct buffer_head *tl_bh = NULL;
5595         struct ocfs2_dinode *di;
5596         struct ocfs2_truncate_log *tl;
5597
5598         *tl_copy = NULL;
5599
5600         mlog(0, "recover truncate log from slot %d\n", slot_num);
5601
5602         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5603         if (status < 0) {
5604                 mlog_errno(status);
5605                 goto bail;
5606         }
5607
5608         di = (struct ocfs2_dinode *) tl_bh->b_data;
5609         tl = &di->id2.i_dealloc;
5610         if (!OCFS2_IS_VALID_DINODE(di)) {
5611                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5612                 status = -EIO;
5613                 goto bail;
5614         }
5615
5616         if (le16_to_cpu(tl->tl_used)) {
5617                 mlog(0, "We'll have %u logs to recover\n",
5618                      le16_to_cpu(tl->tl_used));
5619
5620                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5621                 if (!(*tl_copy)) {
5622                         status = -ENOMEM;
5623                         mlog_errno(status);
5624                         goto bail;
5625                 }
5626
5627                 /* Assuming the write-out below goes well, this copy
5628                  * will be passed back to recovery for processing. */
5629                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5630
5631                 /* All we need to do to clear the truncate log is set
5632                  * tl_used. */
5633                 tl->tl_used = 0;
5634
5635                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5636                 if (status < 0) {
5637                         mlog_errno(status);
5638                         goto bail;
5639                 }
5640         }
5641
5642 bail:
5643         if (tl_inode)
5644                 iput(tl_inode);
5645         if (tl_bh)
5646                 brelse(tl_bh);
5647
5648         if (status < 0 && (*tl_copy)) {
5649                 kfree(*tl_copy);
5650                 *tl_copy = NULL;
5651         }
5652
5653         mlog_exit(status);
5654         return status;
5655 }
5656
5657 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5658                                          struct ocfs2_dinode *tl_copy)
5659 {
5660         int status = 0;
5661         int i;
5662         unsigned int clusters, num_recs, start_cluster;
5663         u64 start_blk;
5664         handle_t *handle;
5665         struct inode *tl_inode = osb->osb_tl_inode;
5666         struct ocfs2_truncate_log *tl;
5667
5668         mlog_entry_void();
5669
5670         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5671                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5672                 return -EINVAL;
5673         }
5674
5675         tl = &tl_copy->id2.i_dealloc;
5676         num_recs = le16_to_cpu(tl->tl_used);
5677         mlog(0, "cleanup %u records from %llu\n", num_recs,
5678              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5679
5680         mutex_lock(&tl_inode->i_mutex);
5681         for(i = 0; i < num_recs; i++) {
5682                 if (ocfs2_truncate_log_needs_flush(osb)) {
5683                         status = __ocfs2_flush_truncate_log(osb);
5684                         if (status < 0) {
5685                                 mlog_errno(status);
5686                                 goto bail_up;
5687                         }
5688                 }
5689
5690                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5691                 if (IS_ERR(handle)) {
5692                         status = PTR_ERR(handle);
5693                         mlog_errno(status);
5694                         goto bail_up;
5695                 }
5696
5697                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5698                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5699                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5700
5701                 status = ocfs2_truncate_log_append(osb, handle,
5702                                                    start_blk, clusters);
5703                 ocfs2_commit_trans(osb, handle);
5704                 if (status < 0) {
5705                         mlog_errno(status);
5706                         goto bail_up;
5707                 }
5708         }
5709
5710 bail_up:
5711         mutex_unlock(&tl_inode->i_mutex);
5712
5713         mlog_exit(status);
5714         return status;
5715 }
5716
5717 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5718 {
5719         int status;
5720         struct inode *tl_inode = osb->osb_tl_inode;
5721
5722         mlog_entry_void();
5723
5724         if (tl_inode) {
5725                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5726                 flush_workqueue(ocfs2_wq);
5727
5728                 status = ocfs2_flush_truncate_log(osb);
5729                 if (status < 0)
5730                         mlog_errno(status);
5731
5732                 brelse(osb->osb_tl_bh);
5733                 iput(osb->osb_tl_inode);
5734         }
5735
5736         mlog_exit_void();
5737 }
5738
5739 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5740 {
5741         int status;
5742         struct inode *tl_inode = NULL;
5743         struct buffer_head *tl_bh = NULL;
5744
5745         mlog_entry_void();
5746
5747         status = ocfs2_get_truncate_log_info(osb,
5748                                              osb->slot_num,
5749                                              &tl_inode,
5750                                              &tl_bh);
5751         if (status < 0)
5752                 mlog_errno(status);
5753
5754         /* ocfs2_truncate_log_shutdown keys on the existence of
5755          * osb->osb_tl_inode so we don't set any of the osb variables
5756          * until we're sure all is well. */
5757         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5758                           ocfs2_truncate_log_worker);
5759         osb->osb_tl_bh    = tl_bh;
5760         osb->osb_tl_inode = tl_inode;
5761
5762         mlog_exit(status);
5763         return status;
5764 }
5765
5766 /*
5767  * Delayed de-allocation of suballocator blocks.
5768  *
5769  * Some sets of block de-allocations might involve multiple suballocator inodes.
5770  *
5771  * The locking for this can get extremely complicated, especially when
5772  * the suballocator inodes to delete from aren't known until deep
5773  * within an unrelated codepath.
5774  *
5775  * ocfs2_extent_block structures are a good example of this - an inode
5776  * btree could have been grown by any number of nodes each allocating
5777  * out of their own suballoc inode.
5778  *
5779  * These structures allow the delay of block de-allocation until a
5780  * later time, when locking of multiple cluster inodes won't cause
5781  * deadlock.
5782  */
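/*
 * Editor's note: a minimal usage sketch of the dealloc context, assuming
 * osb and eb are in scope and that the caller has dropped any cluster
 * locks which could conflict with the suballocator inodes before the
 * final call:
 *
 *	struct ocfs2_cached_dealloc_ctxt dealloc;
 *
 *	ocfs2_init_dealloc_ctxt(&dealloc);
 *	...
 *	ocfs2_cache_extent_block_free(&dealloc, eb);
 *	...
 *	ocfs2_run_deallocs(osb, &dealloc);
 */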
5783
5784 /*
5785  * Describes a single block free from a suballocator
5786  */
5787 struct ocfs2_cached_block_free {
5788         struct ocfs2_cached_block_free          *free_next;
5789         u64                                     free_blk;
5790         unsigned int                            free_bit;
5791 };
5792
5793 struct ocfs2_per_slot_free_list {
5794         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5795         int                                     f_inode_type;
5796         int                                     f_slot;
5797         struct ocfs2_cached_block_free          *f_first;
5798 };
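/*
 * Editor's note: at run time the context ends up holding one per-slot
 * list for each (inode type, slot) pair that was touched, each chaining
 * its individual block frees, roughly:
 *
 *	ctxt->c_first_suballocator
 *	    -> { f_inode_type, f_slot } -> free_blk/free_bit -> ...
 *	    -> { f_inode_type, f_slot } -> ...
 */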
5799
5800 static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5801                                    int sysfile_type,
5802                                    int slot,
5803                                    struct ocfs2_cached_block_free *head)
5804 {
5805         int ret;
5806         u64 bg_blkno;
5807         handle_t *handle;
5808         struct inode *inode;
5809         struct buffer_head *di_bh = NULL;
5810         struct ocfs2_cached_block_free *tmp;
5811
5812         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5813         if (!inode) {
5814                 ret = -EINVAL;
5815                 mlog_errno(ret);
5816                 goto out;
5817         }
5818
5819         mutex_lock(&inode->i_mutex);
5820
5821         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5822         if (ret) {
5823                 mlog_errno(ret);
5824                 goto out_mutex;
5825         }
5826
5827         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5828         if (IS_ERR(handle)) {
5829                 ret = PTR_ERR(handle);
5830                 mlog_errno(ret);
5831                 goto out_unlock;
5832         }
5833
5834         while (head) {
5835                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5836                                                       head->free_bit);
5837                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5838                      head->free_bit, (unsigned long long)head->free_blk);
5839
5840                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5841                                                head->free_bit, bg_blkno, 1);
5842                 if (ret) {
5843                         mlog_errno(ret);
5844                         goto out_journal;
5845                 }
5846
5847                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5848                 if (ret) {
5849                         mlog_errno(ret);
5850                         goto out_journal;
5851                 }
5852
5853                 tmp = head;
5854                 head = head->free_next;
5855                 kfree(tmp);
5856         }
5857
5858 out_journal:
5859         ocfs2_commit_trans(osb, handle);
5860
5861 out_unlock:
5862         ocfs2_inode_unlock(inode, 1);
5863         brelse(di_bh);
5864 out_mutex:
5865         mutex_unlock(&inode->i_mutex);
5866         iput(inode);
5867 out:
5868         while(head) {
5869                 /* Premature exit may have left some dangling items. */
5870                 tmp = head;
5871                 head = head->free_next;
5872                 kfree(tmp);
5873         }
5874
5875         return ret;
5876 }
5877
5878 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5879                        struct ocfs2_cached_dealloc_ctxt *ctxt)
5880 {
5881         int ret = 0, ret2;
5882         struct ocfs2_per_slot_free_list *fl;
5883
5884         if (!ctxt)
5885                 return 0;
5886
5887         while (ctxt->c_first_suballocator) {
5888                 fl = ctxt->c_first_suballocator;
5889
5890                 if (fl->f_first) {
5891                         mlog(0, "Free items: (type %u, slot %d)\n",
5892                              fl->f_inode_type, fl->f_slot);
5893                         ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5894                                                        fl->f_slot, fl->f_first);
5895                         if (ret2)
5896                                 mlog_errno(ret2);
5897                         if (!ret)
5898                                 ret = ret2;
5899                 }
5900
5901                 ctxt->c_first_suballocator = fl->f_next_suballocator;
5902                 kfree(fl);
5903         }
5904
5905         return ret;
5906 }
5907
5908 static struct ocfs2_per_slot_free_list *
5909 ocfs2_find_per_slot_free_list(int type,
5910                               int slot,
5911                               struct ocfs2_cached_dealloc_ctxt *ctxt)
5912 {
5913         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5914
5915         while (fl) {
5916                 if (fl->f_inode_type == type && fl->f_slot == slot)
5917                         return fl;
5918
5919                 fl = fl->f_next_suballocator;
5920         }
5921
5922         fl = kmalloc(sizeof(*fl), GFP_NOFS);
5923         if (fl) {
5924                 fl->f_inode_type = type;
5925                 fl->f_slot = slot;
5926                 fl->f_first = NULL;
5927                 fl->f_next_suballocator = ctxt->c_first_suballocator;
5928
5929                 ctxt->c_first_suballocator = fl;
5930         }
5931         return fl;
5932 }
5933
5934 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5935                                      int type, int slot, u64 blkno,
5936                                      unsigned int bit)
5937 {
5938         int ret;
5939         struct ocfs2_per_slot_free_list *fl;
5940         struct ocfs2_cached_block_free *item;
5941
5942         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5943         if (fl == NULL) {
5944                 ret = -ENOMEM;
5945                 mlog_errno(ret);
5946                 goto out;
5947         }
5948
5949         item = kmalloc(sizeof(*item), GFP_NOFS);
5950         if (item == NULL) {
5951                 ret = -ENOMEM;
5952                 mlog_errno(ret);
5953                 goto out;
5954         }
5955
5956         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5957              type, slot, bit, (unsigned long long)blkno);
5958
5959         item->free_blk = blkno;
5960         item->free_bit = bit;
5961         item->free_next = fl->f_first;
5962
5963         fl->f_first = item;
5964
5965         ret = 0;
5966 out:
5967         return ret;
5968 }
5969
5970 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5971                                          struct ocfs2_extent_block *eb)
5972 {
5973         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5974                                          le16_to_cpu(eb->h_suballoc_slot),
5975                                          le64_to_cpu(eb->h_blkno),
5976                                          le16_to_cpu(eb->h_suballoc_bit));
5977 }
5978
5979 /* This function will figure out whether the current last extent
5980  * block will be deleted, and if it will, what the new last extent
5981  * block will be so we can update its h_next_leaf_blk field, as well
5982  * as the dinode's i_last_eb_blk. */
5983 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
5984                                        unsigned int clusters_to_del,
5985                                        struct ocfs2_path *path,
5986                                        struct buffer_head **new_last_eb)
5987 {
5988         int next_free, ret = 0;
5989         u32 cpos;
5990         struct ocfs2_extent_rec *rec;
5991         struct ocfs2_extent_block *eb;
5992         struct ocfs2_extent_list *el;
5993         struct buffer_head *bh = NULL;
5994
5995         *new_last_eb = NULL;
5996
5997         /* we have no tree, so of course, no last_eb. */
5998         if (!path->p_tree_depth)
5999                 goto out;
6000
6001         /* trunc to zero special case - this makes tree_depth = 0
6002          * regardless of what it is.  */
6003         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6004                 goto out;
6005
6006         el = path_leaf_el(path);
6007         BUG_ON(!el->l_next_free_rec);
6008
6009         /*
6010          * Make sure that this extent list will actually be empty
6011          * after we clear away the data. We can shortcut out if
6012          * there's more than one non-empty extent in the
6013          * list. Otherwise, a check of the remaining extent is
6014          * necessary.
6015          */
6016         next_free = le16_to_cpu(el->l_next_free_rec);
6017         rec = NULL;
6018         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6019                 if (next_free > 2)
6020                         goto out;
6021
6022                 /* We may have a valid extent in index 1, check it. */
6023                 if (next_free == 2)
6024                         rec = &el->l_recs[1];
6025
6026                 /*
6027                  * Fall through - no more nonempty extents, so we want
6028                  * to delete this leaf.
6029                  */
6030         } else {
6031                 if (next_free > 1)
6032                         goto out;
6033
6034                 rec = &el->l_recs[0];
6035         }
6036
6037         if (rec) {
6038                 /*
6039                  * Check that we'll only be trimming off the end of this
6040                  * extent.
6041                  */
6042                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6043                         goto out;
6044         }
6045
6046         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6047         if (ret) {
6048                 mlog_errno(ret);
6049                 goto out;
6050         }
6051
6052         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6053         if (ret) {
6054                 mlog_errno(ret);
6055                 goto out;
6056         }
6057
6058         eb = (struct ocfs2_extent_block *) bh->b_data;
6059         el = &eb->h_list;
6060         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6061                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6062                 ret = -EROFS;
6063                 goto out;
6064         }
6065
6066         *new_last_eb = bh;
6067         get_bh(*new_last_eb);
6068         mlog(0, "returning block %llu, (cpos: %u)\n",
6069              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6070 out:
6071         brelse(bh);
6072
6073         return ret;
6074 }
6075
6076 /*
6077  * Trim some clusters off the rightmost edge of a tree. Only called
6078  * during truncate.
6079  *
6080  * The caller needs to:
6081  *   - start journaling of each path component.
6082  *   - compute and fully set up any new last ext block
6083  */
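/*
 * Editor's note: ocfs2_do_truncate() below is the only caller visible
 * here; it meets both requirements by journaling the whole path with
 * ocfs2_journal_access_path() and preparing the new last extent block
 * via ocfs2_find_new_last_ext_blk() before calling in.
 */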
6084 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6085                            handle_t *handle, struct ocfs2_truncate_context *tc,
6086                            u32 clusters_to_del, u64 *delete_start)
6087 {
6088         int ret, i, index = path->p_tree_depth;
6089         u32 new_edge = 0;
6090         u64 deleted_eb = 0;
6091         struct buffer_head *bh;
6092         struct ocfs2_extent_list *el;
6093         struct ocfs2_extent_rec *rec;
6094
6095         *delete_start = 0;
6096
6097         while (index >= 0) {
6098                 bh = path->p_node[index].bh;
6099                 el = path->p_node[index].el;
6100
6101                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6102                      index,  (unsigned long long)bh->b_blocknr);
6103
6104                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6105
6106                 if (index !=
6107                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6108                         ocfs2_error(inode->i_sb,
6109                                     "Inode %lu has invalid ext. block %llu",
6110                                     inode->i_ino,
6111                                     (unsigned long long)bh->b_blocknr);
6112                         ret = -EROFS;
6113                         goto out;
6114                 }
6115
6116 find_tail_record:
6117                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6118                 rec = &el->l_recs[i];
6119
6120                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6121                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6122                      ocfs2_rec_clusters(el, rec),
6123                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6124                      le16_to_cpu(el->l_next_free_rec));
6125
6126                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6127
6128                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6129                         /*
6130                          * If the leaf block contains a single empty
6131                          * extent and no records, we can just remove
6132                          * the block.
6133                          */
6134                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6135                                 memset(rec, 0,
6136                                        sizeof(struct ocfs2_extent_rec));
6137                                 el->l_next_free_rec = cpu_to_le16(0);
6138
6139                                 goto delete;
6140                         }
6141
6142                         /*
6143                          * Remove any empty extents by shifting things
6144                          * left. That should make life much easier on
6145                          * the code below. This condition is rare
6146                          * enough that we shouldn't see a performance
6147                          * hit.
6148                          */
6149                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6150                                 le16_add_cpu(&el->l_next_free_rec, -1);
6151
6152                                 for(i = 0;
6153                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6154                                         el->l_recs[i] = el->l_recs[i + 1];
6155
6156                                 memset(&el->l_recs[i], 0,
6157                                        sizeof(struct ocfs2_extent_rec));
6158
6159                                 /*
6160                                  * We've modified our extent list. The
6161                                  * simplest way to handle this change
6162                                  * is to begin the search from the
6163                                  * start again.
6164                                  */
6165                                 goto find_tail_record;
6166                         }
6167
6168                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6169
6170                         /*
6171                          * We'll use "new_edge" on our way back up the
6172                          * tree to know what our rightmost cpos is.
6173                          */
6174                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6175                         new_edge += le32_to_cpu(rec->e_cpos);
6176
6177                         /*
6178                          * The caller will use this to delete data blocks.
6179                          */
6180                         *delete_start = le64_to_cpu(rec->e_blkno)
6181                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6182                                         le16_to_cpu(rec->e_leaf_clusters));
6183
6184                         /*
6185                          * If it's now empty, remove this record.
6186                          */
6187                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6188                                 memset(rec, 0,
6189                                        sizeof(struct ocfs2_extent_rec));
6190                                 le16_add_cpu(&el->l_next_free_rec, -1);
6191                         }
6192                 } else {
6193                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6194                                 memset(rec, 0,
6195                                        sizeof(struct ocfs2_extent_rec));
6196                                 le16_add_cpu(&el->l_next_free_rec, -1);
6197
6198                                 goto delete;
6199                         }
6200
6201                         /* Can this actually happen? */
6202                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6203                                 goto delete;
6204
6205                         /*
6206                          * We never actually deleted any clusters
6207                          * because our leaf was empty. There's no
6208                          * reason to adjust the rightmost edge then.
6209                          */
6210                         if (new_edge == 0)
6211                                 goto delete;
6212
6213                         rec->e_int_clusters = cpu_to_le32(new_edge);
6214                         le32_add_cpu(&rec->e_int_clusters,
6215                                      -le32_to_cpu(rec->e_cpos));
6216
6217                          /*
6218                           * A deleted child record should have been
6219                           * caught above.
6220                           */
6221                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6222                 }
6223
6224 delete:
6225                 ret = ocfs2_journal_dirty(handle, bh);
6226                 if (ret) {
6227                         mlog_errno(ret);
6228                         goto out;
6229                 }
6230
6231                 mlog(0, "extent list container %llu, after: record %d: "
6232                      "(%u, %u, %llu), next = %u.\n",
6233                      (unsigned long long)bh->b_blocknr, i,
6234                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6235                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6236                      le16_to_cpu(el->l_next_free_rec));
6237
6238                 /*
6239                  * We must be careful to only attempt delete of an
6240                  * extent block (and not the root inode block).
6241                  */
6242                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6243                         struct ocfs2_extent_block *eb =
6244                                 (struct ocfs2_extent_block *)bh->b_data;
6245
6246                         /*
6247                          * Save this for use when processing the
6248                          * parent block.
6249                          */
6250                         deleted_eb = le64_to_cpu(eb->h_blkno);
6251
6252                         mlog(0, "deleting this extent block.\n");
6253
6254                         ocfs2_remove_from_cache(inode, bh);
6255
6256                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6257                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6258                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6259
6260                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6261                         /* An error here is not fatal. */
6262                         if (ret < 0)
6263                                 mlog_errno(ret);
6264                 } else {
6265                         deleted_eb = 0;
6266                 }
6267
6268                 index--;
6269         }
6270
6271         ret = 0;
6272 out:
6273         return ret;
6274 }
6275
6276 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6277                              unsigned int clusters_to_del,
6278                              struct inode *inode,
6279                              struct buffer_head *fe_bh,
6280                              handle_t *handle,
6281                              struct ocfs2_truncate_context *tc,
6282                              struct ocfs2_path *path)
6283 {
6284         int status;
6285         struct ocfs2_dinode *fe;
6286         struct ocfs2_extent_block *last_eb = NULL;
6287         struct ocfs2_extent_list *el;
6288         struct buffer_head *last_eb_bh = NULL;
6289         u64 delete_blk = 0;
6290
6291         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6292
6293         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6294                                              path, &last_eb_bh);
6295         if (status < 0) {
6296                 mlog_errno(status);
6297                 goto bail;
6298         }
6299
6300         /*
6301          * Each component will be touched, so we might as well journal
6302          * here to avoid having to handle errors later.
6303          */
6304         status = ocfs2_journal_access_path(inode, handle, path);
6305         if (status < 0) {
6306                 mlog_errno(status);
6307                 goto bail;
6308         }
6309
6310         if (last_eb_bh) {
6311                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6312                                               OCFS2_JOURNAL_ACCESS_WRITE);
6313                 if (status < 0) {
6314                         mlog_errno(status);
6315                         goto bail;
6316                 }
6317
6318                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6319         }
6320
6321         el = &(fe->id2.i_list);
6322
6323         /*
6324          * Lower levels depend on this never happening, but it's best
6325          * to check it up here before changing the tree.
6326          */
6327         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6328                 ocfs2_error(inode->i_sb,
6329                             "Inode %lu has an empty extent record, depth %u\n",
6330                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6331                 status = -EROFS;
6332                 goto bail;
6333         }
6334
6335         spin_lock(&OCFS2_I(inode)->ip_lock);
6336         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6337                                       clusters_to_del;
6338         spin_unlock(&OCFS2_I(inode)->ip_lock);
6339         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6340         inode->i_blocks = ocfs2_inode_sector_count(inode);
6341
6342         status = ocfs2_trim_tree(inode, path, handle, tc,
6343                                  clusters_to_del, &delete_blk);
6344         if (status) {
6345                 mlog_errno(status);
6346                 goto bail;
6347         }
6348
6349         if (le32_to_cpu(fe->i_clusters) == 0) {
6350                 /* trunc to zero is a special case. */
6351                 el->l_tree_depth = 0;
6352                 fe->i_last_eb_blk = 0;
6353         } else if (last_eb)
6354                 fe->i_last_eb_blk = last_eb->h_blkno;
6355
6356         status = ocfs2_journal_dirty(handle, fe_bh);
6357         if (status < 0) {
6358                 mlog_errno(status);
6359                 goto bail;
6360         }
6361
6362         if (last_eb) {
6363                 /* If there will be a new last extent block, then by
6364                  * definition, there cannot be any leaves to the right of
6365                  * it. */
6366                 last_eb->h_next_leaf_blk = 0;
6367                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6368                 if (status < 0) {
6369                         mlog_errno(status);
6370                         goto bail;
6371                 }
6372         }
6373
6374         if (delete_blk) {
6375                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6376                                                    clusters_to_del);
6377                 if (status < 0) {
6378                         mlog_errno(status);
6379                         goto bail;
6380                 }
6381         }
6382         status = 0;
6383 bail:
6384
6385         mlog_exit(status);
6386         return status;
6387 }
6388
6389 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
6390 {
6391         set_buffer_uptodate(bh);
6392         mark_buffer_dirty(bh);
6393         return 0;
6394 }
6395
6396 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6397 {
6398         set_buffer_uptodate(bh);
6399         mark_buffer_dirty(bh);
6400         return ocfs2_journal_dirty_data(handle, bh);
6401 }
6402
6403 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6404                                      unsigned int from, unsigned int to,
6405                                      struct page *page, int zero, u64 *phys)
6406 {
6407         int ret, partial = 0;
6408
6409         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6410         if (ret)
6411                 mlog_errno(ret);
6412
6413         if (zero)
6414                 zero_user_segment(page, from, to);
6415
6416         /*
6417          * Need to set the buffers we zero'd into uptodate
6418          * here if they aren't - ocfs2_map_page_blocks()
6419          * might've skipped some
6420          */
6421         if (ocfs2_should_order_data(inode)) {
6422                 ret = walk_page_buffers(handle,
6423                                         page_buffers(page),
6424                                         from, to, &partial,
6425                                         ocfs2_ordered_zero_func);
6426                 if (ret < 0)
6427                         mlog_errno(ret);
6428         } else {
6429                 ret = walk_page_buffers(handle, page_buffers(page),
6430                                         from, to, &partial,
6431                                         ocfs2_writeback_zero_func);
6432                 if (ret < 0)
6433                         mlog_errno(ret);
6434         }
6435
6436         if (!partial)
6437                 SetPageUptodate(page);
6438
6439         flush_dcache_page(page);
6440 }
6441
6442 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6443                                      loff_t end, struct page **pages,
6444                                      int numpages, u64 phys, handle_t *handle)
6445 {
6446         int i;
6447         struct page *page;
6448         unsigned int from, to = PAGE_CACHE_SIZE;
6449         struct super_block *sb = inode->i_sb;
6450
6451         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6452
6453         if (numpages == 0)
6454                 goto out;
6455
6456         to = PAGE_CACHE_SIZE;
6457         for(i = 0; i < numpages; i++) {
6458                 page = pages[i];
6459
6460                 from = start & (PAGE_CACHE_SIZE - 1);
6461                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6462                         to = end & (PAGE_CACHE_SIZE - 1);
6463
6464                 BUG_ON(from > PAGE_CACHE_SIZE);
6465                 BUG_ON(to > PAGE_CACHE_SIZE);
6466
6467                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6468                                          &phys);
6469
6470                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6471         }
6472 out:
6473         if (pages)
6474                 ocfs2_unlock_and_free_pages(pages, numpages);
6475 }
6476
6477 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6478                                 struct page **pages, int *num)
6479 {
6480         int numpages, ret = 0;
6481         struct super_block *sb = inode->i_sb;
6482         struct address_space *mapping = inode->i_mapping;
6483         unsigned long index;
6484         loff_t last_page_bytes;
6485
6486         BUG_ON(start > end);
6487
6488         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6489                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6490
6491         numpages = 0;
6492         last_page_bytes = PAGE_ALIGN(end);
6493         index = start >> PAGE_CACHE_SHIFT;
6494         do {
6495                 pages[numpages] = grab_cache_page(mapping, index);
6496                 if (!pages[numpages]) {
6497                         ret = -ENOMEM;
6498                         mlog_errno(ret);
6499                         goto out;
6500                 }
6501
6502                 numpages++;
6503                 index++;
6504         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6505
6506 out:
6507         if (ret != 0) {
6508                 if (pages)
6509                         ocfs2_unlock_and_free_pages(pages, numpages);
6510                 numpages = 0;
6511         }
6512
6513         *num = numpages;
6514
6515         return ret;
6516 }
6517
6518 /*
6519  * Zero the area past i_size but still within an allocated
6520  * cluster. This avoids exposing nonzero data on subsequent file
6521  * extends.
6522  *
6523  * We need to call this before i_size is updated on the inode because
6524  * otherwise block_write_full_page() will skip writeout of pages past
6525  * i_size. The range to zero is passed in explicitly for this reason.
6526  */
6527 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6528                                   u64 range_start, u64 range_end)
6529 {
6530         int ret = 0, numpages;
6531         struct page **pages = NULL;
6532         u64 phys;
6533         unsigned int ext_flags;
6534         struct super_block *sb = inode->i_sb;
6535
6536         /*
6537          * File systems which don't support sparse files zero on every
6538          * extend.
6539          */
6540         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6541                 return 0;
6542
6543         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6544                         sizeof(struct page *), GFP_NOFS);
6545         if (pages == NULL) {
6546                 ret = -ENOMEM;
6547                 mlog_errno(ret);
6548                 goto out;
6549         }
6550
6551         if (range_start == range_end)
6552                 goto out;
6553
6554         ret = ocfs2_extent_map_get_blocks(inode,
6555                                           range_start >> sb->s_blocksize_bits,
6556                                           &phys, NULL, &ext_flags);
6557         if (ret) {
6558                 mlog_errno(ret);
6559                 goto out;
6560         }
6561
6562         /*
6563          * Tail is a hole, or is marked unwritten. In either case, we
6564          * can count on read and write to return/push zeros.
6565          */
6566         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6567                 goto out;
6568
6569         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6570                                    &numpages);
6571         if (ret) {
6572                 mlog_errno(ret);
6573                 goto out;
6574         }
6575
6576         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6577                                  numpages, phys, handle);
6578
6579         /*
6580          * Initiate writeout of the pages we zero'd here. We don't
6581          * wait on them - the truncate_inode_pages() call later will
6582          * do that for us.
6583          */
6584         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6585                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6586         if (ret)
6587                 mlog_errno(ret);
6588
6589 out:
6590         if (pages)
6591                 kfree(pages);
6592
6593         return ret;
6594 }
6595
6596 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6597                                              struct ocfs2_dinode *di)
6598 {
6599         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6600         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6601
6602         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6603                 memset(&di->id2, 0, blocksize -
6604                                     offsetof(struct ocfs2_dinode, id2) -
6605                                     xattrsize);
6606         else
6607                 memset(&di->id2, 0, blocksize -
6608                                     offsetof(struct ocfs2_dinode, id2));
6609 }
6610
6611 void ocfs2_dinode_new_extent_list(struct inode *inode,
6612                                   struct ocfs2_dinode *di)
6613 {
6614         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6615         di->id2.i_list.l_tree_depth = 0;
6616         di->id2.i_list.l_next_free_rec = 0;
6617         di->id2.i_list.l_count = cpu_to_le16(
6618                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6619 }
6620
6621 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6622 {
6623         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6624         struct ocfs2_inline_data *idata = &di->id2.i_data;
6625
6626         spin_lock(&oi->ip_lock);
6627         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6628         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6629         spin_unlock(&oi->ip_lock);
6630
6631         /*
6632          * We clear the entire i_data structure here so that all
6633          * fields can be properly initialized.
6634          */
6635         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6636
6637         idata->id_count = cpu_to_le16(
6638                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6639 }
6640
6641 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6642                                          struct buffer_head *di_bh)
6643 {
6644         int ret, i, has_data, num_pages = 0;
6645         handle_t *handle;
6646         u64 uninitialized_var(block);
6647         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6648         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6649         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6650         struct ocfs2_alloc_context *data_ac = NULL;
6651         struct page **pages = NULL;
6652         loff_t end = osb->s_clustersize;
6653         struct ocfs2_extent_tree et;
6654
6655         has_data = i_size_read(inode) ? 1 : 0;
6656
6657         if (has_data) {
6658                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6659                                 sizeof(struct page *), GFP_NOFS);
6660                 if (pages == NULL) {
6661                         ret = -ENOMEM;
6662                         mlog_errno(ret);
6663                         goto out;
6664                 }
6665
6666                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6667                 if (ret) {
6668                         mlog_errno(ret);
6669                         goto out;
6670                 }
6671         }
6672
6673         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6674         if (IS_ERR(handle)) {
6675                 ret = PTR_ERR(handle);
6676                 mlog_errno(ret);
6677                 goto out_unlock;
6678         }
6679
6680         ret = ocfs2_journal_access(handle, inode, di_bh,
6681                                    OCFS2_JOURNAL_ACCESS_WRITE);
6682         if (ret) {
6683                 mlog_errno(ret);
6684                 goto out_commit;
6685         }
6686
6687         if (has_data) {
6688                 u32 bit_off, num;
6689                 unsigned int page_end;
6690                 u64 phys;
6691
6692                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6693                                            &num);
6694                 if (ret) {
6695                         mlog_errno(ret);
6696                         goto out_commit;
6697                 }
6698
6699                 /*
6700                  * Save two copies, one for insert, and one that can
6701                  * be changed by ocfs2_map_and_dirty_page() below.
6702                  */
6703                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6704
6705                 /*
6706                  * Non sparse file systems zero on extend, so no need
6707                  * to do that now.
6708                  */
6709                 if (!ocfs2_sparse_alloc(osb) &&
6710                     PAGE_CACHE_SIZE < osb->s_clustersize)
6711                         end = PAGE_CACHE_SIZE;
6712
6713                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6714                 if (ret) {
6715                         mlog_errno(ret);
6716                         goto out_commit;
6717                 }
6718
6719                 /*
6720                  * This should populate the 1st page for us and mark
6721                  * it up to date.
6722                  */
6723                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6724                 if (ret) {
6725                         mlog_errno(ret);
6726                         goto out_commit;
6727                 }
6728
6729                 page_end = PAGE_CACHE_SIZE;
6730                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6731                         page_end = osb->s_clustersize;
6732
6733                 for (i = 0; i < num_pages; i++)
6734                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6735                                                  pages[i], i > 0, &phys);
6736         }
6737
6738         spin_lock(&oi->ip_lock);
6739         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6740         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6741         spin_unlock(&oi->ip_lock);
6742
6743         ocfs2_dinode_new_extent_list(inode, di);
6744
6745         ocfs2_journal_dirty(handle, di_bh);
6746
6747         if (has_data) {
6748                 /*
6749                  * An error at this point should be extremely rare. If
6750                  * this proves to be false, we could always re-build
6751                  * the in-inode data from our pages.
6752                  */
6753                 ocfs2_get_dinode_extent_tree(&et, inode, di_bh);
6754                 ret = ocfs2_insert_extent(osb, handle, inode, &et,
6755                                           0, block, 1, 0, NULL);
6756                 ocfs2_put_extent_tree(&et);
6757                 if (ret) {
6758                         mlog_errno(ret);
6759                         goto out_commit;
6760                 }
6761
6762                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6763         }
6764
6765 out_commit:
6766         ocfs2_commit_trans(osb, handle);
6767
6768 out_unlock:
6769         if (data_ac)
6770                 ocfs2_free_alloc_context(data_ac);
6771
6772 out:
6773         if (pages) {
6774                 ocfs2_unlock_and_free_pages(pages, num_pages);
6775                 kfree(pages);
6776         }
6777
6778         return ret;
6779 }
6780
6781 /*
6782  * It is expected that, by the time you call this function,
6783  * inode->i_size and fe->i_size have been adjusted.
6784  *
6785  * WARNING: This will kfree the truncate context
6786  */
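/*
 * Editor's note: a minimal sketch of the intended calling order, assuming
 * the inode is already cluster-locked and fe_bh holds its dinode:
 *
 *	struct ocfs2_truncate_context *tc = NULL;
 *
 *	... adjust inode->i_size and fe->i_size for the new length ...
 *
 *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 *	if (!status)
 *		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
 */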
6787 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6788                           struct inode *inode,
6789                           struct buffer_head *fe_bh,
6790                           struct ocfs2_truncate_context *tc)
6791 {
6792         int status, i, credits, tl_sem = 0;
6793         u32 clusters_to_del, new_highest_cpos, range;
6794         struct ocfs2_extent_list *el;
6795         handle_t *handle = NULL;
6796         struct inode *tl_inode = osb->osb_tl_inode;
6797         struct ocfs2_path *path = NULL;
6798         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6799
6800         mlog_entry_void();
6801
6802         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6803                                                      i_size_read(inode));
6804
6805         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6806         if (!path) {
6807                 status = -ENOMEM;
6808                 mlog_errno(status);
6809                 goto bail;
6810         }
6811
6812         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6813
6814 start:
6815         /*
6816          * Check that we still have allocation to delete.
6817          */
6818         if (OCFS2_I(inode)->ip_clusters == 0) {
6819                 status = 0;
6820                 goto bail;
6821         }
6822
6823         /*
6824          * Truncate always works against the rightmost tree branch.
6825          */
6826         status = ocfs2_find_path(inode, path, UINT_MAX);
6827         if (status) {
6828                 mlog_errno(status);
6829                 goto bail;
6830         }
6831
6832         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6833              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6834
6835         /*
6836          * By now, el will point to the extent list on the bottommost
6837          * portion of this tree. Only the tail record is considered in
6838          * each pass.
6839          *
6840          * We handle the following cases, in order:
6841          * - empty extent: delete the remaining branch
6842          * - remove the entire record
6843          * - remove a partial record
6844          * - no record needs to be removed (truncate has completed)
6845          */
6846         el = path_leaf_el(path);
6847         if (le16_to_cpu(el->l_next_free_rec) == 0) {
6848                 ocfs2_error(inode->i_sb,
6849                             "Inode %llu has empty extent block at %llu\n",
6850                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6851                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
6852                 status = -EROFS;
6853                 goto bail;
6854         }
6855
6856         i = le16_to_cpu(el->l_next_free_rec) - 1;
6857         range = le32_to_cpu(el->l_recs[i].e_cpos) +
6858                 ocfs2_rec_clusters(el, &el->l_recs[i]);
6859         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6860                 clusters_to_del = 0;
6861         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6862                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6863         } else if (range > new_highest_cpos) {
6864                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6865                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
6866                                   new_highest_cpos;
6867         } else {
6868                 status = 0;
6869                 goto bail;
6870         }
6871
6872         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6873              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6874
6875         mutex_lock(&tl_inode->i_mutex);
6876         tl_sem = 1;
6877         /* ocfs2_truncate_log_needs_flush() tells us whether the log is
6878          * full. If it is, we flush it first so that at least one record
6879          * is free for use below.  */
6880         if (ocfs2_truncate_log_needs_flush(osb)) {
6881                 status = __ocfs2_flush_truncate_log(osb);
6882                 if (status < 0) {
6883                         mlog_errno(status);
6884                         goto bail;
6885                 }
6886         }
6887
6888         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6889                                                 (struct ocfs2_dinode *)fe_bh->b_data,
6890                                                 el);
6891         handle = ocfs2_start_trans(osb, credits);
6892         if (IS_ERR(handle)) {
6893                 status = PTR_ERR(handle);
6894                 handle = NULL;
6895                 mlog_errno(status);
6896                 goto bail;
6897         }
6898
6899         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
6900                                    tc, path);
6901         if (status < 0) {
6902                 mlog_errno(status);
6903                 goto bail;
6904         }
6905
6906         mutex_unlock(&tl_inode->i_mutex);
6907         tl_sem = 0;
6908
6909         ocfs2_commit_trans(osb, handle);
6910         handle = NULL;
6911
6912         ocfs2_reinit_path(path, 1);
6913
6914         /*
6915          * The check above will catch the case where we've truncated
6916          * away all allocation.
6917          */
6918         goto start;
6919
6920 bail:
6921
6922         ocfs2_schedule_truncate_log_flush(osb, 1);
6923
6924         if (tl_sem)
6925                 mutex_unlock(&tl_inode->i_mutex);
6926
6927         if (handle)
6928                 ocfs2_commit_trans(osb, handle);
6929
6930         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6931
6932         ocfs2_free_path(path);
6933
6934         /* This will drop the ext_alloc cluster lock for us */
6935         ocfs2_free_truncate_context(tc);
6936
6937         mlog_exit(status);
6938         return status;
6939 }
6940
6941 /*
6942  * Expects the inode to already be locked.
6943  */
6944 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6945                            struct inode *inode,
6946                            struct buffer_head *fe_bh,
6947                            struct ocfs2_truncate_context **tc)
6948 {
6949         int status;
6950         unsigned int new_i_clusters;
6951         struct ocfs2_dinode *fe;
6952         struct ocfs2_extent_block *eb;
6953         struct buffer_head *last_eb_bh = NULL;
6954
6955         mlog_entry_void();
6956
6957         *tc = NULL;
6958
6959         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
6960                                                   i_size_read(inode));
6961         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6962
6963         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
6964              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
6965              (unsigned long long)le64_to_cpu(fe->i_size));
6966
6967         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
6968         if (!(*tc)) {
6969                 status = -ENOMEM;
6970                 mlog_errno(status);
6971                 goto bail;
6972         }
6973         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6974
6975         if (fe->id2.i_list.l_tree_depth) {
6976                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
6977                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
6978                 if (status < 0) {
6979                         mlog_errno(status);
6980                         goto bail;
6981                 }
6982                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6983                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6984                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6985
6986                         brelse(last_eb_bh);
6987                         status = -EIO;
6988                         goto bail;
6989                 }
6990         }
6991
6992         (*tc)->tc_last_eb_bh = last_eb_bh;
6993
6994         status = 0;
6995 bail:
6996         if (status < 0) {
6997                 if (*tc)
6998                         ocfs2_free_truncate_context(*tc);
6999                 *tc = NULL;
7000         }
7001         mlog_exit_void();
7002         return status;
7003 }
7004
7005 /*
7006  * 'start' is inclusive, 'end' is not.
7007  */
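/*
 * Hypothetical example (byte offsets made up for illustration): zero
 * bytes 100..199 of an inline-data inode without moving i_size:
 *
 *	ret = ocfs2_truncate_inline(inode, di_bh, 100, 200, 0);
 *
 * Passing a non-zero 'trunc' additionally pulls i_size back to 'start'.
 */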
7008 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7009                           unsigned int start, unsigned int end, int trunc)
7010 {
7011         int ret;
7012         unsigned int numbytes;
7013         handle_t *handle;
7014         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7015         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7016         struct ocfs2_inline_data *idata = &di->id2.i_data;
7017
7018         if (end > i_size_read(inode))
7019                 end = i_size_read(inode);
7020
7021         BUG_ON(start >= end);
7022
7023         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7024             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7025             !ocfs2_supports_inline_data(osb)) {
7026                 ocfs2_error(inode->i_sb,
7027                             "Inline data flags for inode %llu don't agree! "
7028                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7029                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7030                             le16_to_cpu(di->i_dyn_features),
7031                             OCFS2_I(inode)->ip_dyn_features,
7032                             osb->s_feature_incompat);
7033                 ret = -EROFS;
7034                 goto out;
7035         }
7036
7037         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7038         if (IS_ERR(handle)) {
7039                 ret = PTR_ERR(handle);
7040                 mlog_errno(ret);
7041                 goto out;
7042         }
7043
7044         ret = ocfs2_journal_access(handle, inode, di_bh,
7045                                    OCFS2_JOURNAL_ACCESS_WRITE);
7046         if (ret) {
7047                 mlog_errno(ret);
7048                 goto out_commit;
7049         }
7050
7051         numbytes = end - start;
7052         memset(idata->id_data + start, 0, numbytes);
7053
7054         /*
7055          * No need to worry about the data page here - it's been
7056          * truncated already and inline data doesn't need it for
7057          * pushing zeros to disk, so we'll let readpage pick it up
7058          * later.
7059          */
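	/*
	 * A non-zero 'trunc' means the caller is shrinking the file:
	 * pull i_size back to 'start'. Otherwise we only zero the byte
	 * range and leave the size alone.
	 */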
7060         if (trunc) {
7061                 i_size_write(inode, start);
7062                 di->i_size = cpu_to_le64(start);
7063         }
7064
7065         inode->i_blocks = ocfs2_inode_sector_count(inode);
7066         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7067
7068         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7069         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7070
7071         ocfs2_journal_dirty(handle, di_bh);
7072
7073 out_commit:
7074         ocfs2_commit_trans(osb, handle);
7075
7076 out:
7077         return ret;
7078 }
7079
7080 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7081 {
7082         /*
7083          * The caller is responsible for completing deallocation
7084          * before freeing the context.
7085          */
7086         if (tc->tc_dealloc.c_first_suballocator != NULL)
7087                 mlog(ML_NOTICE,
7088                      "Truncate completion has non-empty dealloc context\n");
7089
7090         if (tc->tc_last_eb_bh)
7091                 brelse(tc->tc_last_eb_bh);
7092
7093         kfree(tc);
7094 }