ocfs2: Add xattr index tree operations
fs/ocfs2/alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52 /*
53  * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
54  * the b-tree operations in ocfs2. The b-tree operations are no longer
55  * limited to ocfs2_dinode: any metadata which needs to allocate clusters
56  * for its storage can use a b-tree, provided it implements its own
57  * ocfs2_extent_tree and the matching operations.
58  *
59  * ocfs2_extent_tree contains info for the root of the b-tree; it must have
60  * a root ocfs2_extent_list and a root_bh so that they can be used in the
61  * b-tree functions.
62  * ocfs2_extent_tree_operations abstracts the normal operations we do for
63  * the root of the extent b-tree.
64  */
65 struct ocfs2_extent_tree;
66
67 struct ocfs2_extent_tree_operations {
68         void (*set_last_eb_blk) (struct ocfs2_extent_tree *et, u64 blkno);
69         u64 (*get_last_eb_blk) (struct ocfs2_extent_tree *et);
70         void (*update_clusters) (struct inode *inode,
71                                  struct ocfs2_extent_tree *et,
72                                  u32 new_clusters);
73         int (*sanity_check) (struct inode *inode, struct ocfs2_extent_tree *et);
74 };
75
76 struct ocfs2_extent_tree {
77         enum ocfs2_extent_tree_type type;
78         struct ocfs2_extent_tree_operations *eops;
79         struct buffer_head *root_bh;
80         struct ocfs2_extent_list *root_el;
81         void *private;
82 };
83
84 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
85                                          u64 blkno)
86 {
87         struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
88
89         BUG_ON(et->type != OCFS2_DINODE_EXTENT);
90         di->i_last_eb_blk = cpu_to_le64(blkno);
91 }
92
93 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
94 {
95         struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
96
97         BUG_ON(et->type != OCFS2_DINODE_EXTENT);
98         return le64_to_cpu(di->i_last_eb_blk);
99 }
100
101 static void ocfs2_dinode_update_clusters(struct inode *inode,
102                                          struct ocfs2_extent_tree *et,
103                                          u32 clusters)
104 {
105         struct ocfs2_dinode *di =
106                         (struct ocfs2_dinode *)et->root_bh->b_data;
107
108         le32_add_cpu(&di->i_clusters, clusters);
109         spin_lock(&OCFS2_I(inode)->ip_lock);
110         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
111         spin_unlock(&OCFS2_I(inode)->ip_lock);
112 }
113
114 static int ocfs2_dinode_sanity_check(struct inode *inode,
115                                      struct ocfs2_extent_tree *et)
116 {
117         int ret = 0;
118         struct ocfs2_dinode *di;
119
120         BUG_ON(et->type != OCFS2_DINODE_EXTENT);
121
122         di = (struct ocfs2_dinode *)et->root_bh->b_data;
123         if (!OCFS2_IS_VALID_DINODE(di)) {
124                 ret = -EIO;
125                 ocfs2_error(inode->i_sb,
126                         "Inode %llu has invalid path root",
127                         (unsigned long long)OCFS2_I(inode)->ip_blkno);
128         }
129
130         return ret;
131 }
132
133 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
134         .set_last_eb_blk        = ocfs2_dinode_set_last_eb_blk,
135         .get_last_eb_blk        = ocfs2_dinode_get_last_eb_blk,
136         .update_clusters        = ocfs2_dinode_update_clusters,
137         .sanity_check           = ocfs2_dinode_sanity_check,
138 };
139
140 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
141                                               u64 blkno)
142 {
143         struct ocfs2_xattr_value_root *xv =
144                 (struct ocfs2_xattr_value_root *)et->private;
145
146         xv->xr_last_eb_blk = cpu_to_le64(blkno);
147 }
148
149 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
150 {
151         struct ocfs2_xattr_value_root *xv =
152                 (struct ocfs2_xattr_value_root *) et->private;
153
154         return le64_to_cpu(xv->xr_last_eb_blk);
155 }
156
157 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
158                                               struct ocfs2_extent_tree *et,
159                                               u32 clusters)
160 {
161         struct ocfs2_xattr_value_root *xv =
162                 (struct ocfs2_xattr_value_root *)et->private;
163
164         le32_add_cpu(&xv->xr_clusters, clusters);
165 }
166
167 static int ocfs2_xattr_value_sanity_check(struct inode *inode,
168                                           struct ocfs2_extent_tree *et)
169 {
170         return 0;
171 }
172
173 static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
174         .set_last_eb_blk        = ocfs2_xattr_value_set_last_eb_blk,
175         .get_last_eb_blk        = ocfs2_xattr_value_get_last_eb_blk,
176         .update_clusters        = ocfs2_xattr_value_update_clusters,
177         .sanity_check           = ocfs2_xattr_value_sanity_check,
178 };
179
180 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
181                                              u64 blkno)
182 {
183         struct ocfs2_xattr_block *xb =
184                 (struct ocfs2_xattr_block *) et->root_bh->b_data;
185         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
186
187         xt->xt_last_eb_blk = cpu_to_le64(blkno);
188 }
189
190 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
191 {
192         struct ocfs2_xattr_block *xb =
193                 (struct ocfs2_xattr_block *) et->root_bh->b_data;
194         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
195
196         return le64_to_cpu(xt->xt_last_eb_blk);
197 }
198
199 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
200                                              struct ocfs2_extent_tree *et,
201                                              u32 clusters)
202 {
203         struct ocfs2_xattr_block *xb =
204                         (struct ocfs2_xattr_block *)et->root_bh->b_data;
205
206         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
207 }
208
209 static int ocfs2_xattr_tree_sanity_check(struct inode *inode,
210                                          struct ocfs2_extent_tree *et)
211 {
212         return 0;
213 }
214
215 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
216         .set_last_eb_blk        = ocfs2_xattr_tree_set_last_eb_blk,
217         .get_last_eb_blk        = ocfs2_xattr_tree_get_last_eb_blk,
218         .update_clusters        = ocfs2_xattr_tree_update_clusters,
219         .sanity_check           = ocfs2_xattr_tree_sanity_check,
220 };
221
222 static struct ocfs2_extent_tree*
223          ocfs2_new_extent_tree(struct buffer_head *bh,
224                                enum ocfs2_extent_tree_type et_type,
225                                void *private)
226 {
227         struct ocfs2_extent_tree *et;
228
229         et = kzalloc(sizeof(*et), GFP_NOFS);
230         if (!et)
231                 return NULL;
232
233         et->type = et_type;
234         get_bh(bh);
235         et->root_bh = bh;
236         et->private = private;
237
238         if (et_type == OCFS2_DINODE_EXTENT) {
239                 et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
240                 et->eops = &ocfs2_dinode_et_ops;
241         } else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
242                 struct ocfs2_xattr_value_root *xv =
243                         (struct ocfs2_xattr_value_root *) private;
244                 et->root_el = &xv->xr_list;
245                 et->eops = &ocfs2_xattr_et_ops;
246         } else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
247                 struct ocfs2_xattr_block *xb =
248                         (struct ocfs2_xattr_block *)bh->b_data;
249                 et->root_el = &xb->xb_attrs.xb_root.xt_list;
250                 et->eops = &ocfs2_xattr_tree_et_ops;
251         }
252
253         return et;
254 }
255
256 static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
257 {
258         if (et) {
259                 brelse(et->root_bh);
260                 kfree(et);
261         }
262 }
263
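/*
 * A minimal usage sketch for the helpers above, assuming a caller that
 * already holds the root buffer_head (the dinode case needs no private
 * data, while the xattr value case would pass its ocfs2_xattr_value_root):
 *
 *	struct ocfs2_extent_tree *et;
 *
 *	et = ocfs2_new_extent_tree(di_bh, OCFS2_DINODE_EXTENT, NULL);
 *	if (!et)
 *		return -ENOMEM;
 *
 *	... work against et->root_el via et->eops ...
 *
 *	ocfs2_free_extent_tree(et);
 *
 * ocfs2_new_extent_tree() takes its own reference on the buffer_head
 * (get_bh()), so the caller's di_bh reference can be dropped
 * independently of the tree's lifetime.
 */
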
264 static inline void ocfs2_set_last_eb_blk(struct ocfs2_extent_tree *et,
265                                          u64 new_last_eb_blk)
266 {
267         et->eops->set_last_eb_blk(et, new_last_eb_blk);
268 }
269
270 static inline u64 ocfs2_get_last_eb_blk(struct ocfs2_extent_tree *et)
271 {
272         return et->eops->get_last_eb_blk(et);
273 }
274
275 static inline void ocfs2_update_clusters(struct inode *inode,
276                                          struct ocfs2_extent_tree *et,
277                                          u32 clusters)
278 {
279         et->eops->update_clusters(inode, et, clusters);
280 }
281
282 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
283 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
284                                          struct ocfs2_extent_block *eb);
285
286 /*
287  * Structures which describe a path through a btree, and functions to
288  * manipulate them.
289  *
290  * The idea here is to be as generic as possible with the tree
291  * manipulation code.
292  */
293 struct ocfs2_path_item {
294         struct buffer_head              *bh;
295         struct ocfs2_extent_list        *el;
296 };
297
298 #define OCFS2_MAX_PATH_DEPTH    5
299
300 struct ocfs2_path {
301         int                     p_tree_depth;
302         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
303 };
304
305 #define path_root_bh(_path) ((_path)->p_node[0].bh)
306 #define path_root_el(_path) ((_path)->p_node[0].el)
307 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
308 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
309 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
310
311 /*
312  * Reset the actual path elements so that we can re-use the structure
313  * to build another path. Generally, this involves freeing the buffer
314  * heads.
315  */
316 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
317 {
318         int i, start = 0, depth = 0;
319         struct ocfs2_path_item *node;
320
321         if (keep_root)
322                 start = 1;
323
324         for(i = start; i < path_num_items(path); i++) {
325                 node = &path->p_node[i];
326
327                 brelse(node->bh);
328                 node->bh = NULL;
329                 node->el = NULL;
330         }
331
332         /*
333          * Tree depth may change during truncate, or insert. If we're
334          * keeping the root extent list, then make sure that our path
335          * structure reflects the proper depth.
336          */
337         if (keep_root)
338                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
339
340         path->p_tree_depth = depth;
341 }
342
343 static void ocfs2_free_path(struct ocfs2_path *path)
344 {
345         if (path) {
346                 ocfs2_reinit_path(path, 0);
347                 kfree(path);
348         }
349 }
350
351 /*
352  * Copy all the elements of src into dest. After this call, src could be freed
353  * without affecting dest.
354  *
355  * Both paths should have the same root. Any non-root elements of dest
356  * will be freed.
357  */
358 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
359 {
360         int i;
361
362         BUG_ON(path_root_bh(dest) != path_root_bh(src));
363         BUG_ON(path_root_el(dest) != path_root_el(src));
364
365         ocfs2_reinit_path(dest, 1);
366
367         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
368                 dest->p_node[i].bh = src->p_node[i].bh;
369                 dest->p_node[i].el = src->p_node[i].el;
370
371                 if (dest->p_node[i].bh)
372                         get_bh(dest->p_node[i].bh);
373         }
374 }
375
376 /*
377  * Make the *dest path the same as src and re-initialize src path to
378  * have a root only.
379  */
380 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
381 {
382         int i;
383
384         BUG_ON(path_root_bh(dest) != path_root_bh(src));
385
386         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
387                 brelse(dest->p_node[i].bh);
388
389                 dest->p_node[i].bh = src->p_node[i].bh;
390                 dest->p_node[i].el = src->p_node[i].el;
391
392                 src->p_node[i].bh = NULL;
393                 src->p_node[i].el = NULL;
394         }
395 }
396
397 /*
398  * Insert an extent block at the given index.
399  *
400  * This will not take an additional reference on eb_bh.
401  */
402 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
403                                         struct buffer_head *eb_bh)
404 {
405         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
406
407         /*
408          * Right now, no root bh is an extent block, so this helps
409          * catch code errors with dinode trees. The assertion can be
410          * safely removed if we ever need to insert extent block
411          * structures at the root.
412          */
413         BUG_ON(index == 0);
414
415         path->p_node[index].bh = eb_bh;
416         path->p_node[index].el = &eb->h_list;
417 }
418
419 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
420                                          struct ocfs2_extent_list *root_el)
421 {
422         struct ocfs2_path *path;
423
424         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
425
426         path = kzalloc(sizeof(*path), GFP_NOFS);
427         if (path) {
428                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
429                 get_bh(root_bh);
430                 path_root_bh(path) = root_bh;
431                 path_root_el(path) = root_el;
432         }
433
434         return path;
435 }
436
437 /*
438  * Convenience function to journal all components in a path.
439  */
440 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
441                                      struct ocfs2_path *path)
442 {
443         int i, ret = 0;
444
445         if (!path)
446                 goto out;
447
448         for(i = 0; i < path_num_items(path); i++) {
449                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
450                                            OCFS2_JOURNAL_ACCESS_WRITE);
451                 if (ret < 0) {
452                         mlog_errno(ret);
453                         goto out;
454                 }
455         }
456
457 out:
458         return ret;
459 }
460
461 /*
462  * Return the index of the extent record which contains cluster #v_cluster.
463  * -1 is returned if it was not found.
464  *
465  * Should work fine on interior and exterior nodes.
466  */
467 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
468 {
469         int ret = -1;
470         int i;
471         struct ocfs2_extent_rec *rec;
472         u32 rec_end, rec_start, clusters;
473
474         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
475                 rec = &el->l_recs[i];
476
477                 rec_start = le32_to_cpu(rec->e_cpos);
478                 clusters = ocfs2_rec_clusters(el, rec);
479
480                 rec_end = rec_start + clusters;
481
482                 if (v_cluster >= rec_start && v_cluster < rec_end) {
483                         ret = i;
484                         break;
485                 }
486         }
487
488         return ret;
489 }
490
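/*
 * A sketch of how a lookup might use ocfs2_search_extent_list() on a
 * leaf list to turn a virtual cluster into a physical block number
 * (v_cluster, sb and the surrounding locking are assumed to come from
 * the caller):
 *
 *	index = ocfs2_search_extent_list(el, v_cluster);
 *	if (index == -1)
 *		return -ENOENT;		(hole, or past the last extent)
 *
 *	rec = &el->l_recs[index];
 *	coff = v_cluster - le32_to_cpu(rec->e_cpos);
 *	p_blkno = le64_to_cpu(rec->e_blkno) +
 *		  ocfs2_clusters_to_blocks(sb, coff);
 */
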
491 enum ocfs2_contig_type {
492         CONTIG_NONE = 0,
493         CONTIG_LEFT,
494         CONTIG_RIGHT,
495         CONTIG_LEFTRIGHT,
496 };
497
498
499 /*
500  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
501  * ocfs2_extent_contig only work properly against leaf nodes!
502  */
503 static int ocfs2_block_extent_contig(struct super_block *sb,
504                                      struct ocfs2_extent_rec *ext,
505                                      u64 blkno)
506 {
507         u64 blk_end = le64_to_cpu(ext->e_blkno);
508
509         blk_end += ocfs2_clusters_to_blocks(sb,
510                                     le16_to_cpu(ext->e_leaf_clusters));
511
512         return blkno == blk_end;
513 }
514
515 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
516                                   struct ocfs2_extent_rec *right)
517 {
518         u32 left_range;
519
520         left_range = le32_to_cpu(left->e_cpos) +
521                 le16_to_cpu(left->e_leaf_clusters);
522
523         return (left_range == le32_to_cpu(right->e_cpos));
524 }
525
526 static enum ocfs2_contig_type
527         ocfs2_extent_contig(struct inode *inode,
528                             struct ocfs2_extent_rec *ext,
529                             struct ocfs2_extent_rec *insert_rec)
530 {
531         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
532
533         /*
534          * Refuse to coalesce extent records with different flag
535          * fields - we don't want to mix unwritten extents with user
536          * data.
537          */
538         if (ext->e_flags != insert_rec->e_flags)
539                 return CONTIG_NONE;
540
541         if (ocfs2_extents_adjacent(ext, insert_rec) &&
542             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
543                         return CONTIG_RIGHT;
544
545         blkno = le64_to_cpu(ext->e_blkno);
546         if (ocfs2_extents_adjacent(insert_rec, ext) &&
547             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
548                 return CONTIG_LEFT;
549
550         return CONTIG_NONE;
551 }
552
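/*
 * A worked example of the contiguousness checks, with made-up numbers:
 * if ext covers clusters [100, 108) starting at block B, then an
 * insert_rec with e_cpos == 108 and e_blkno == B +
 * ocfs2_clusters_to_blocks(sb, 8) yields CONTIG_RIGHT.  The mirror
 * case, where insert_rec ends exactly where ext begins both logically
 * and physically, yields CONTIG_LEFT.  In either case the e_flags
 * fields must match, otherwise CONTIG_NONE is returned.
 */
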
553 /*
554  * NOTE: We can have pretty much any combination of contiguousness and
555  * appending.
556  *
557  * The usefulness of APPEND_TAIL is more in that it lets us know that
558  * we'll have to update the path to that leaf.
559  */
560 enum ocfs2_append_type {
561         APPEND_NONE = 0,
562         APPEND_TAIL,
563 };
564
565 enum ocfs2_split_type {
566         SPLIT_NONE = 0,
567         SPLIT_LEFT,
568         SPLIT_RIGHT,
569 };
570
571 struct ocfs2_insert_type {
572         enum ocfs2_split_type   ins_split;
573         enum ocfs2_append_type  ins_appending;
574         enum ocfs2_contig_type  ins_contig;
575         int                     ins_contig_index;
576         int                     ins_tree_depth;
577 };
578
579 struct ocfs2_merge_ctxt {
580         enum ocfs2_contig_type  c_contig_type;
581         int                     c_has_empty_extent;
582         int                     c_split_covers_rec;
583 };
584
585 /*
586  * How many free extents have we got before we need more metadata?
587  */
588 int ocfs2_num_free_extents(struct ocfs2_super *osb,
589                            struct inode *inode,
590                            struct buffer_head *root_bh,
591                            enum ocfs2_extent_tree_type type,
592                            void *private)
593 {
594         int retval;
595         struct ocfs2_extent_list *el = NULL;
596         struct ocfs2_extent_block *eb;
597         struct buffer_head *eb_bh = NULL;
598         u64 last_eb_blk = 0;
599
600         mlog_entry_void();
601
602         if (type == OCFS2_DINODE_EXTENT) {
603                 struct ocfs2_dinode *fe =
604                                 (struct ocfs2_dinode *)root_bh->b_data;
605                 if (!OCFS2_IS_VALID_DINODE(fe)) {
606                         OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
607                         retval = -EIO;
608                         goto bail;
609                 }
610
611                 if (fe->i_last_eb_blk)
612                         last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
613                 el = &fe->id2.i_list;
614         } else if (type == OCFS2_XATTR_VALUE_EXTENT) {
615                 struct ocfs2_xattr_value_root *xv =
616                         (struct ocfs2_xattr_value_root *) private;
617
618                 last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
619                 el = &xv->xr_list;
620         } else if (type == OCFS2_XATTR_TREE_EXTENT) {
621                 struct ocfs2_xattr_block *xb =
622                         (struct ocfs2_xattr_block *)root_bh->b_data;
623
624                 last_eb_blk = le64_to_cpu(xb->xb_attrs.xb_root.xt_last_eb_blk);
625                 el = &xb->xb_attrs.xb_root.xt_list;
626         }
627
628         if (last_eb_blk) {
629                 retval = ocfs2_read_block(osb, last_eb_blk,
630                                           &eb_bh, OCFS2_BH_CACHED, inode);
631                 if (retval < 0) {
632                         mlog_errno(retval);
633                         goto bail;
634                 }
635                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
636                 el = &eb->h_list;
637         }
638
639         BUG_ON(el->l_tree_depth != 0);
640
641         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
642 bail:
643         if (eb_bh)
644                 brelse(eb_bh);
645
646         mlog_exit(retval);
647         return retval;
648 }
649
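/*
 * A rough sketch of the intended calling pattern (the reservation step
 * is only indicated, since the exact helper used for it lives outside
 * this file and is an assumption here):
 *
 *	free_extents = ocfs2_num_free_extents(osb, inode, di_bh,
 *					      OCFS2_DINODE_EXTENT, NULL);
 *	if (free_extents < 0)
 *		goto out;
 *	if (free_extents == 0) {
 *		... reserve metadata (new extent blocks) into meta_ac
 *		    before attempting the insert ...
 *	}
 */
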
650 /* expects array to already be allocated
651  *
652  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
653  * l_count for you
654  */
655 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
656                                      handle_t *handle,
657                                      struct inode *inode,
658                                      int wanted,
659                                      struct ocfs2_alloc_context *meta_ac,
660                                      struct buffer_head *bhs[])
661 {
662         int count, status, i;
663         u16 suballoc_bit_start;
664         u32 num_got;
665         u64 first_blkno;
666         struct ocfs2_extent_block *eb;
667
668         mlog_entry_void();
669
670         count = 0;
671         while (count < wanted) {
672                 status = ocfs2_claim_metadata(osb,
673                                               handle,
674                                               meta_ac,
675                                               wanted - count,
676                                               &suballoc_bit_start,
677                                               &num_got,
678                                               &first_blkno);
679                 if (status < 0) {
680                         mlog_errno(status);
681                         goto bail;
682                 }
683
684                 for(i = count;  i < (num_got + count); i++) {
685                         bhs[i] = sb_getblk(osb->sb, first_blkno);
686                         if (bhs[i] == NULL) {
687                                 status = -EIO;
688                                 mlog_errno(status);
689                                 goto bail;
690                         }
691                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
692
693                         status = ocfs2_journal_access(handle, inode, bhs[i],
694                                                       OCFS2_JOURNAL_ACCESS_CREATE);
695                         if (status < 0) {
696                                 mlog_errno(status);
697                                 goto bail;
698                         }
699
700                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
701                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
702                         /* Ok, setup the minimal stuff here. */
703                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
704                         eb->h_blkno = cpu_to_le64(first_blkno);
705                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
706                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
707                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
708                         eb->h_list.l_count =
709                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
710
711                         suballoc_bit_start++;
712                         first_blkno++;
713
714                         /* We'll also be dirtied by the caller, so
715                          * this isn't absolutely necessary. */
716                         status = ocfs2_journal_dirty(handle, bhs[i]);
717                         if (status < 0) {
718                                 mlog_errno(status);
719                                 goto bail;
720                         }
721                 }
722
723                 count += num_got;
724         }
725
726         status = 0;
727 bail:
728         if (status < 0) {
729                 for(i = 0; i < wanted; i++) {
730                         if (bhs[i])
731                                 brelse(bhs[i]);
732                         bhs[i] = NULL;
733                 }
734         }
735         mlog_exit(status);
736         return status;
737 }
738
739 /*
740  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
741  *
742  * Returns the sum of the rightmost extent rec logical offset and
743  * cluster count.
744  *
745  * ocfs2_add_branch() uses this to determine what logical cluster
746  * value should be populated into the leftmost new branch records.
747  *
748  * ocfs2_shift_tree_depth() uses this to determine the # clusters
749  * value for the new topmost tree record.
750  */
751 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
752 {
753         int i;
754
755         i = le16_to_cpu(el->l_next_free_rec) - 1;
756
757         return le32_to_cpu(el->l_recs[i].e_cpos) +
758                 ocfs2_rec_clusters(el, &el->l_recs[i]);
759 }
760
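/*
 * For example, if the rightmost record of el covers clusters
 * [200, 216) (e_cpos == 200, 16 clusters), this returns 216.
 * ocfs2_add_branch() stamps that value into e_cpos of the single empty
 * record in each new block of the branch, and ocfs2_shift_tree_depth()
 * uses it as e_int_clusters of the new topmost record (whose e_cpos
 * is 0).
 */
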
761 /*
762  * Add an entire tree branch to our inode. eb_bh is the extent block
763  * to start at, if we don't want to start the branch at the dinode
764  * structure.
765  *
766  * last_eb_bh is required as we have to update its next_leaf pointer
767  * for the new last extent block.
768  *
769  * The new branch will be 'empty' in the sense that every block will
770  * contain a single record with cluster count == 0.
771  */
772 static int ocfs2_add_branch(struct ocfs2_super *osb,
773                             handle_t *handle,
774                             struct inode *inode,
775                             struct ocfs2_extent_tree *et,
776                             struct buffer_head *eb_bh,
777                             struct buffer_head **last_eb_bh,
778                             struct ocfs2_alloc_context *meta_ac)
779 {
780         int status, new_blocks, i;
781         u64 next_blkno, new_last_eb_blk;
782         struct buffer_head *bh;
783         struct buffer_head **new_eb_bhs = NULL;
784         struct ocfs2_extent_block *eb;
785         struct ocfs2_extent_list  *eb_el;
786         struct ocfs2_extent_list  *el;
787         u32 new_cpos;
788
789         mlog_entry_void();
790
791         BUG_ON(!last_eb_bh || !*last_eb_bh);
792
793         if (eb_bh) {
794                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
795                 el = &eb->h_list;
796         } else
797                 el = et->root_el;
798
799         /* we never add a branch to a leaf. */
800         BUG_ON(!el->l_tree_depth);
801
802         new_blocks = le16_to_cpu(el->l_tree_depth);
803
804         /* allocate the number of new eb blocks we need */
805         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
806                              GFP_KERNEL);
807         if (!new_eb_bhs) {
808                 status = -ENOMEM;
809                 mlog_errno(status);
810                 goto bail;
811         }
812
813         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
814                                            meta_ac, new_eb_bhs);
815         if (status < 0) {
816                 mlog_errno(status);
817                 goto bail;
818         }
819
820         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
821         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
822
823         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
824          * linked with the rest of the tree.
825          * Conversely, new_eb_bhs[0] is the new bottommost leaf.
826          *
827          * When we leave the loop, new_last_eb_blk will point to the
828          * newest leaf, and next_blkno will point to the topmost extent
829          * block. */
830         next_blkno = new_last_eb_blk = 0;
831         for(i = 0; i < new_blocks; i++) {
832                 bh = new_eb_bhs[i];
833                 eb = (struct ocfs2_extent_block *) bh->b_data;
834                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
835                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
836                         status = -EIO;
837                         goto bail;
838                 }
839                 eb_el = &eb->h_list;
840
841                 status = ocfs2_journal_access(handle, inode, bh,
842                                               OCFS2_JOURNAL_ACCESS_CREATE);
843                 if (status < 0) {
844                         mlog_errno(status);
845                         goto bail;
846                 }
847
848                 eb->h_next_leaf_blk = 0;
849                 eb_el->l_tree_depth = cpu_to_le16(i);
850                 eb_el->l_next_free_rec = cpu_to_le16(1);
851                 /*
852                  * This actually counts as an empty extent as
853                  * c_clusters == 0
854                  */
855                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
856                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
857                 /*
858                  * eb_el isn't always an interior node, but even leaf
859                  * nodes want a zero'd flags and reserved field so
860                  * this gets the whole 32 bits regardless of use.
861                  */
862                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
863                 if (!eb_el->l_tree_depth)
864                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
865
866                 status = ocfs2_journal_dirty(handle, bh);
867                 if (status < 0) {
868                         mlog_errno(status);
869                         goto bail;
870                 }
871
872                 next_blkno = le64_to_cpu(eb->h_blkno);
873         }
874
875         /* This is a bit hairy. We want to update up to three blocks
876          * here without leaving any of them in an inconsistent state
877          * in case of error. We don't have to worry about
878          * journal_dirty erroring as it won't unless we've aborted the
879          * handle (in which case we would never be here) so reserving
880          * the write with journal_access is all we need to do. */
881         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
882                                       OCFS2_JOURNAL_ACCESS_WRITE);
883         if (status < 0) {
884                 mlog_errno(status);
885                 goto bail;
886         }
887         status = ocfs2_journal_access(handle, inode, et->root_bh,
888                                       OCFS2_JOURNAL_ACCESS_WRITE);
889         if (status < 0) {
890                 mlog_errno(status);
891                 goto bail;
892         }
893         if (eb_bh) {
894                 status = ocfs2_journal_access(handle, inode, eb_bh,
895                                               OCFS2_JOURNAL_ACCESS_WRITE);
896                 if (status < 0) {
897                         mlog_errno(status);
898                         goto bail;
899                 }
900         }
901
902         /* Link the new branch into the rest of the tree (el will
903          * either be on the root_bh, or the extent block passed in). */
904         i = le16_to_cpu(el->l_next_free_rec);
905         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
906         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
907         el->l_recs[i].e_int_clusters = 0;
908         le16_add_cpu(&el->l_next_free_rec, 1);
909
910         /* fe needs a new last extent block pointer, as does the
911          * next_leaf on the previously last-extent-block. */
912         ocfs2_set_last_eb_blk(et, new_last_eb_blk);
913
914         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
915         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
916
917         status = ocfs2_journal_dirty(handle, *last_eb_bh);
918         if (status < 0)
919                 mlog_errno(status);
920         status = ocfs2_journal_dirty(handle, et->root_bh);
921         if (status < 0)
922                 mlog_errno(status);
923         if (eb_bh) {
924                 status = ocfs2_journal_dirty(handle, eb_bh);
925                 if (status < 0)
926                         mlog_errno(status);
927         }
928
929         /*
930          * Some callers want to track the rightmost leaf so pass it
931          * back here.
932          */
933         brelse(*last_eb_bh);
934         get_bh(new_eb_bhs[0]);
935         *last_eb_bh = new_eb_bhs[0];
936
937         status = 0;
938 bail:
939         if (new_eb_bhs) {
940                 for (i = 0; i < new_blocks; i++)
941                         if (new_eb_bhs[i])
942                                 brelse(new_eb_bhs[i]);
943                 kfree(new_eb_bhs);
944         }
945
946         mlog_exit(status);
947         return status;
948 }
949
950 /*
951  * Adds another level to the allocation tree.
952  * Returns the new extent block so you can add a branch to it
953  * after this call.
954  */
955 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
956                                   handle_t *handle,
957                                   struct inode *inode,
958                                   struct ocfs2_extent_tree *et,
959                                   struct ocfs2_alloc_context *meta_ac,
960                                   struct buffer_head **ret_new_eb_bh)
961 {
962         int status, i;
963         u32 new_clusters;
964         struct buffer_head *new_eb_bh = NULL;
965         struct ocfs2_extent_block *eb;
966         struct ocfs2_extent_list  *root_el;
967         struct ocfs2_extent_list  *eb_el;
968
969         mlog_entry_void();
970
971         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
972                                            &new_eb_bh);
973         if (status < 0) {
974                 mlog_errno(status);
975                 goto bail;
976         }
977
978         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
979         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
980                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
981                 status = -EIO;
982                 goto bail;
983         }
984
985         eb_el = &eb->h_list;
986         root_el = et->root_el;
987
988         status = ocfs2_journal_access(handle, inode, new_eb_bh,
989                                       OCFS2_JOURNAL_ACCESS_CREATE);
990         if (status < 0) {
991                 mlog_errno(status);
992                 goto bail;
993         }
994
995         /* copy the root extent list data into the new extent block */
996         eb_el->l_tree_depth = root_el->l_tree_depth;
997         eb_el->l_next_free_rec = root_el->l_next_free_rec;
998         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
999                 eb_el->l_recs[i] = root_el->l_recs[i];
1000
1001         status = ocfs2_journal_dirty(handle, new_eb_bh);
1002         if (status < 0) {
1003                 mlog_errno(status);
1004                 goto bail;
1005         }
1006
1007         status = ocfs2_journal_access(handle, inode, et->root_bh,
1008                                       OCFS2_JOURNAL_ACCESS_WRITE);
1009         if (status < 0) {
1010                 mlog_errno(status);
1011                 goto bail;
1012         }
1013
1014         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1015
1016         /* update root_bh now */
1017         le16_add_cpu(&root_el->l_tree_depth, 1);
1018         root_el->l_recs[0].e_cpos = 0;
1019         root_el->l_recs[0].e_blkno = eb->h_blkno;
1020         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1021         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1022                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1023         root_el->l_next_free_rec = cpu_to_le16(1);
1024
1025         /* If this is our 1st tree depth shift, then last_eb_blk
1026          * becomes the allocated extent block */
1027         if (root_el->l_tree_depth == cpu_to_le16(1))
1028                 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1029
1030         status = ocfs2_journal_dirty(handle, et->root_bh);
1031         if (status < 0) {
1032                 mlog_errno(status);
1033                 goto bail;
1034         }
1035
1036         *ret_new_eb_bh = new_eb_bh;
1037         new_eb_bh = NULL;
1038         status = 0;
1039 bail:
1040         if (new_eb_bh)
1041                 brelse(new_eb_bh);
1042
1043         mlog_exit(status);
1044         return status;
1045 }
1046
1047 /*
1048  * Should only be called when there is no space left in any of the
1049  * leaf nodes. What we want to do is find the lowest tree depth
1050  * non-leaf extent block with room for new records. There are three
1051  * valid results of this search:
1052  *
1053  * 1) a lowest extent block is found, then we pass it back in
1054  *    *lowest_eb_bh and return '0'
1055  *
1056  * 2) the search fails to find anything, but the root_el has room. We
1057  *    pass NULL back in *lowest_eb_bh, but still return '0'
1058  *
1059  * 3) the search fails to find anything AND the root_el is full, in
1060  *    which case we return > 0
1061  *
1062  * return status < 0 indicates an error.
1063  */
1064 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1065                                     struct inode *inode,
1066                                     struct ocfs2_extent_tree *et,
1067                                     struct buffer_head **target_bh)
1068 {
1069         int status = 0, i;
1070         u64 blkno;
1071         struct ocfs2_extent_block *eb;
1072         struct ocfs2_extent_list  *el;
1073         struct buffer_head *bh = NULL;
1074         struct buffer_head *lowest_bh = NULL;
1075
1076         mlog_entry_void();
1077
1078         *target_bh = NULL;
1079
1080         el = et->root_el;
1081
1082         while(le16_to_cpu(el->l_tree_depth) > 1) {
1083                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1084                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1085                                     "extent list (next_free_rec == 0)",
1086                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1087                         status = -EIO;
1088                         goto bail;
1089                 }
1090                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1091                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1092                 if (!blkno) {
1093                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1094                                     "list where extent # %d has no physical "
1095                                     "block start",
1096                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1097                         status = -EIO;
1098                         goto bail;
1099                 }
1100
1101                 if (bh) {
1102                         brelse(bh);
1103                         bh = NULL;
1104                 }
1105
1106                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
1107                                           inode);
1108                 if (status < 0) {
1109                         mlog_errno(status);
1110                         goto bail;
1111                 }
1112
1113                 eb = (struct ocfs2_extent_block *) bh->b_data;
1114                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1115                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1116                         status = -EIO;
1117                         goto bail;
1118                 }
1119                 el = &eb->h_list;
1120
1121                 if (le16_to_cpu(el->l_next_free_rec) <
1122                     le16_to_cpu(el->l_count)) {
1123                         if (lowest_bh)
1124                                 brelse(lowest_bh);
1125                         lowest_bh = bh;
1126                         get_bh(lowest_bh);
1127                 }
1128         }
1129
1130         /* If we didn't find one and the fe doesn't have any room,
1131          * then return '1' */
1132         el = et->root_el;
1133         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1134                 status = 1;
1135
1136         *target_bh = lowest_bh;
1137 bail:
1138         if (bh)
1139                 brelse(bh);
1140
1141         mlog_exit(status);
1142         return status;
1143 }
1144
1145 /*
1146  * Grow a b-tree so that it has more records.
1147  *
1148  * We might shift the tree depth in which case existing paths should
1149  * be considered invalid.
1150  *
1151  * Tree depth after the grow is returned via *final_depth.
1152  *
1153  * *last_eb_bh will be updated by ocfs2_add_branch().
1154  */
1155 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1156                            struct ocfs2_extent_tree *et, int *final_depth,
1157                            struct buffer_head **last_eb_bh,
1158                            struct ocfs2_alloc_context *meta_ac)
1159 {
1160         int ret, shift;
1161         struct ocfs2_extent_list *el = et->root_el;
1162         int depth = le16_to_cpu(el->l_tree_depth);
1163         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1164         struct buffer_head *bh = NULL;
1165
1166         BUG_ON(meta_ac == NULL);
1167
1168         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1169         if (shift < 0) {
1170                 ret = shift;
1171                 mlog_errno(ret);
1172                 goto out;
1173         }
1174
1175         /* We traveled all the way to the bottom of the allocation tree
1176          * and didn't find room for any more extents - we need to add
1177          * another tree level */
1178         if (shift) {
1179                 BUG_ON(bh);
1180                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1181
1182                 /* ocfs2_shift_tree_depth will return us a buffer with
1183                  * the new extent block (so we can pass that to
1184                  * ocfs2_add_branch). */
1185                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1186                                              meta_ac, &bh);
1187                 if (ret < 0) {
1188                         mlog_errno(ret);
1189                         goto out;
1190                 }
1191                 depth++;
1192                 if (depth == 1) {
1193                         /*
1194                          * Special case: we have room now if we shifted from
1195                          * tree_depth 0, so no more work needs to be done.
1196                          *
1197                          * We won't be calling add_branch, so pass
1198                          * back *last_eb_bh as the new leaf. At depth
1199                          * zero, it should always be null so there's
1200                          * no reason to brelse.
1201                          */
1202                         BUG_ON(*last_eb_bh);
1203                         get_bh(bh);
1204                         *last_eb_bh = bh;
1205                         goto out;
1206                 }
1207         }
1208
1209         /* call ocfs2_add_branch to add the final part of the tree with
1210          * the new data. */
1211         mlog(0, "add branch. bh = %p\n", bh);
1212         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1213                                meta_ac);
1214         if (ret < 0) {
1215                 mlog_errno(ret);
1216                 goto out;
1217         }
1218
1219 out:
1220         if (final_depth)
1221                 *final_depth = depth;
1222         brelse(bh);
1223         return ret;
1224 }
1225
1226 /*
1227  * Shift all records in the list one slot to the right, making room at index 0.
1228  */
1229 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1230 {
1231         int next_free = le16_to_cpu(el->l_next_free_rec);
1232         int count = le16_to_cpu(el->l_count);
1233         unsigned int num_bytes;
1234
1235         BUG_ON(!next_free);
1236         /* This will cause us to go off the end of our extent list. */
1237         BUG_ON(next_free >= count);
1238
1239         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1240
1241         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1242 }
1243
1244 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1245                               struct ocfs2_extent_rec *insert_rec)
1246 {
1247         int i, insert_index, next_free, has_empty, num_bytes;
1248         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1249         struct ocfs2_extent_rec *rec;
1250
1251         next_free = le16_to_cpu(el->l_next_free_rec);
1252         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1253
1254         BUG_ON(!next_free);
1255
1256         /* The tree code before us didn't allow enough room in the leaf. */
1257         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1258
1259         /*
1260          * The easiest way to approach this is to just remove the
1261          * empty extent and temporarily decrement next_free.
1262          */
1263         if (has_empty) {
1264                 /*
1265                  * If next_free was 1 (only an empty extent), this
1266                  * loop won't execute, which is fine. We still want
1267                  * the decrement above to happen.
1268                  */
1269                 for(i = 0; i < (next_free - 1); i++)
1270                         el->l_recs[i] = el->l_recs[i+1];
1271
1272                 next_free--;
1273         }
1274
1275         /*
1276          * Figure out what the new record index should be.
1277          */
1278         for(i = 0; i < next_free; i++) {
1279                 rec = &el->l_recs[i];
1280
1281                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1282                         break;
1283         }
1284         insert_index = i;
1285
1286         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1287              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1288
1289         BUG_ON(insert_index < 0);
1290         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1291         BUG_ON(insert_index > next_free);
1292
1293         /*
1294          * No need to memmove if we're just adding to the tail.
1295          */
1296         if (insert_index != next_free) {
1297                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1298
1299                 num_bytes = next_free - insert_index;
1300                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1301                 memmove(&el->l_recs[insert_index + 1],
1302                         &el->l_recs[insert_index],
1303                         num_bytes);
1304         }
1305
1306         /*
1307          * Either we had an empty extent and need to re-increment, or
1308          * there was no empty extent on a non-full rightmost leaf node,
1309          * in which case we still need to increment.
1310          */
1311         next_free++;
1312         el->l_next_free_rec = cpu_to_le16(next_free);
1313         /*
1314          * Make sure none of the math above just messed up our tree.
1315          */
1316         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1317
1318         el->l_recs[insert_index] = *insert_rec;
1319
1320 }
1321
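/*
 * A small worked example for ocfs2_rotate_leaf(): assume a leaf whose
 * records are [empty, cpos 10, cpos 20] with next_free == 3, and an
 * insert_rec at cpos 14.  The empty record is dropped (next_free is
 * temporarily 2), insert_index works out to 1 (the slot before
 * cpos 20), the cpos 20 record is memmoved right by one, and the new
 * record is stored, leaving [10, 14, 20] with next_free == 3 again.
 */
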
1322 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1323 {
1324         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1325
1326         BUG_ON(num_recs == 0);
1327
1328         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1329                 num_recs--;
1330                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1331                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1332                 memset(&el->l_recs[num_recs], 0,
1333                        sizeof(struct ocfs2_extent_rec));
1334                 el->l_next_free_rec = cpu_to_le16(num_recs);
1335         }
1336 }
1337
1338 /*
1339  * Create an empty extent record.
1340  *
1341  * l_next_free_rec may be updated.
1342  *
1343  * If an empty extent already exists do nothing.
1344  */
1345 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1346 {
1347         int next_free = le16_to_cpu(el->l_next_free_rec);
1348
1349         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1350
1351         if (next_free == 0)
1352                 goto set_and_inc;
1353
1354         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1355                 return;
1356
1357         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1358                         "Asked to create an empty extent in a full list:\n"
1359                         "count = %u, tree depth = %u",
1360                         le16_to_cpu(el->l_count),
1361                         le16_to_cpu(el->l_tree_depth));
1362
1363         ocfs2_shift_records_right(el);
1364
1365 set_and_inc:
1366         le16_add_cpu(&el->l_next_free_rec, 1);
1367         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1368 }
1369
1370 /*
1371  * For a rotation which involves two leaf nodes, the "root node" is
1372  * the lowest level tree node which contains a path to both leaves. The
1373  * resulting set of information can be used to form a complete "subtree".
1374  *
1375  * This function is passed two full paths from the dinode down to a
1376  * pair of adjacent leaves. Its task is to figure out which path
1377  * index contains the subtree root - this can be the root index itself
1378  * in a worst-case rotation.
1379  *
1380  * The array index of the subtree root is passed back.
1381  */
1382 static int ocfs2_find_subtree_root(struct inode *inode,
1383                                    struct ocfs2_path *left,
1384                                    struct ocfs2_path *right)
1385 {
1386         int i = 0;
1387
1388         /*
1389          * Check that the caller passed in two paths from the same tree.
1390          */
1391         BUG_ON(path_root_bh(left) != path_root_bh(right));
1392
1393         do {
1394                 i++;
1395
1396                 /*
1397                  * The caller didn't pass two adjacent paths.
1398                  */
1399                 mlog_bug_on_msg(i > left->p_tree_depth,
1400                                 "Inode %lu, left depth %u, right depth %u\n"
1401                                 "left leaf blk %llu, right leaf blk %llu\n",
1402                                 inode->i_ino, left->p_tree_depth,
1403                                 right->p_tree_depth,
1404                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1405                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1406         } while (left->p_node[i].bh->b_blocknr ==
1407                  right->p_node[i].bh->b_blocknr);
1408
1409         return i - 1;
1410 }
1411
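/*
 * As an example, with tree depth 2 and the two paths
 *
 *	left:  root -> A -> L1
 *	right: root -> A -> L2
 *
 * the loop above stops at index 2 (the leaves differ), so the subtree
 * root index returned is 1 (block A).  If the paths diverge directly
 * below the root, the returned index is 0, the worst case mentioned
 * above.
 */
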
1412 typedef void (path_insert_t)(void *, struct buffer_head *);
1413
1414 /*
1415  * Traverse a btree path in search of cpos, starting at root_el.
1416  *
1417  * This code can be called with a cpos larger than the tree, in which
1418  * case it will return the rightmost path.
1419  */
1420 static int __ocfs2_find_path(struct inode *inode,
1421                              struct ocfs2_extent_list *root_el, u32 cpos,
1422                              path_insert_t *func, void *data)
1423 {
1424         int i, ret = 0;
1425         u32 range;
1426         u64 blkno;
1427         struct buffer_head *bh = NULL;
1428         struct ocfs2_extent_block *eb;
1429         struct ocfs2_extent_list *el;
1430         struct ocfs2_extent_rec *rec;
1431         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1432
1433         el = root_el;
1434         while (el->l_tree_depth) {
1435                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1436                         ocfs2_error(inode->i_sb,
1437                                     "Inode %llu has empty extent list at "
1438                                     "depth %u\n",
1439                                     (unsigned long long)oi->ip_blkno,
1440                                     le16_to_cpu(el->l_tree_depth));
1441                         ret = -EROFS;
1442                         goto out;
1443
1444                 }
1445
1446                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1447                         rec = &el->l_recs[i];
1448
1449                         /*
1450                          * In the case that cpos is off the allocation
1451                          * tree, this should just wind up returning the
1452                          * rightmost record.
1453                          */
1454                         range = le32_to_cpu(rec->e_cpos) +
1455                                 ocfs2_rec_clusters(el, rec);
1456                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1457                             break;
1458                 }
1459
1460                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1461                 if (blkno == 0) {
1462                         ocfs2_error(inode->i_sb,
1463                                     "Inode %llu has bad blkno in extent list "
1464                                     "at depth %u (index %d)\n",
1465                                     (unsigned long long)oi->ip_blkno,
1466                                     le16_to_cpu(el->l_tree_depth), i);
1467                         ret = -EROFS;
1468                         goto out;
1469                 }
1470
1471                 brelse(bh);
1472                 bh = NULL;
1473                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1474                                        &bh, OCFS2_BH_CACHED, inode);
1475                 if (ret) {
1476                         mlog_errno(ret);
1477                         goto out;
1478                 }
1479
1480                 eb = (struct ocfs2_extent_block *) bh->b_data;
1481                 el = &eb->h_list;
1482                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1483                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1484                         ret = -EIO;
1485                         goto out;
1486                 }
1487
1488                 if (le16_to_cpu(el->l_next_free_rec) >
1489                     le16_to_cpu(el->l_count)) {
1490                         ocfs2_error(inode->i_sb,
1491                                     "Inode %llu has bad count in extent list "
1492                                     "at block %llu (next free=%u, count=%u)\n",
1493                                     (unsigned long long)oi->ip_blkno,
1494                                     (unsigned long long)bh->b_blocknr,
1495                                     le16_to_cpu(el->l_next_free_rec),
1496                                     le16_to_cpu(el->l_count));
1497                         ret = -EROFS;
1498                         goto out;
1499                 }
1500
1501                 if (func)
1502                         func(data, bh);
1503         }
1504
1505 out:
1506         /*
1507          * Catch any trailing bh that the loop didn't handle.
1508          */
1509         brelse(bh);
1510
1511         return ret;
1512 }
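
/*
 * Editorial sketch, not part of alloc.c (struct demo_rec and
 * demo_pick_record are hypothetical names): the per-level record
 * search that __ocfs2_find_path() performs above, shown on a
 * simplified, CPU-endian record array.  If cpos lies beyond every
 * record, the loop falls through to the last one, which is what makes
 * "return the rightmost path" work for an oversized cpos.
 */
struct demo_rec {
        unsigned int cpos;      /* first cluster covered by the record */
        unsigned int clusters;  /* number of clusters covered */
};

static int demo_pick_record(struct demo_rec *recs, int nr_recs,
                            unsigned int cpos)
{
        int i;

        for (i = 0; i < nr_recs - 1; i++) {
                unsigned int range = recs[i].cpos + recs[i].clusters;

                if (cpos >= recs[i].cpos && cpos < range)
                        break;
        }

        return i;       /* index of the child record to descend into */
}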
1513
1514 /*
1515  * Given an initialized path (that is, it has a valid root extent
1516  * list), this function will traverse the btree in search of the path
1517  * which would contain cpos.
1518  *
1519  * The path traveled is recorded in the path structure.
1520  *
1521  * Note that this will not do any comparisons on leaf node extent
1522  * records, so it will work fine in the case that we just added a tree
1523  * branch.
1524  */
1525 struct find_path_data {
1526         int index;
1527         struct ocfs2_path *path;
1528 };
1529 static void find_path_ins(void *data, struct buffer_head *bh)
1530 {
1531         struct find_path_data *fp = data;
1532
1533         get_bh(bh);
1534         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1535         fp->index++;
1536 }
1537 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1538                            u32 cpos)
1539 {
1540         struct find_path_data data;
1541
1542         data.index = 1;
1543         data.path = path;
1544         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1545                                  find_path_ins, &data);
1546 }
1547
1548 static void find_leaf_ins(void *data, struct buffer_head *bh)
1549 {
1550         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1551         struct ocfs2_extent_list *el = &eb->h_list;
1552         struct buffer_head **ret = data;
1553
1554         /* We want to retain only the leaf block. */
1555         if (le16_to_cpu(el->l_tree_depth) == 0) {
1556                 get_bh(bh);
1557                 *ret = bh;
1558         }
1559 }
1560 /*
1561  * Find the leaf block in the tree which would contain cpos. No
1562  * checking of the actual leaf is done.
1563  *
1564  * Some callers want to use this instead of allocating a path
1565  * structure and calling ocfs2_find_path().
1566  *
1567  * This function doesn't handle non-btree extent lists.
1568  */
1569 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1570                     u32 cpos, struct buffer_head **leaf_bh)
1571 {
1572         int ret;
1573         struct buffer_head *bh = NULL;
1574
1575         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1576         if (ret) {
1577                 mlog_errno(ret);
1578                 goto out;
1579         }
1580
1581         *leaf_bh = bh;
1582 out:
1583         return ret;
1584 }
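
/*
 * Editorial usage sketch: find_leaf_ins() takes an extra reference on
 * the leaf buffer, so a caller of ocfs2_find_leaf() owns the returned
 * buffer_head and must brelse() it when finished, e.g.:
 *
 *      struct buffer_head *leaf_bh = NULL;
 *
 *      ret = ocfs2_find_leaf(inode, root_el, cpos, &leaf_bh);
 *      if (ret == 0) {
 *              ... examine the leaf extent list ...
 *              brelse(leaf_bh);
 *      }
 */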
1585
1586 /*
1587  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1588  *
1589  * Basically, we've moved stuff around at the bottom of the tree and
1590  * we need to fix up the extent records above them to reflect the
1591  * new layout.
1592  *
1593  * left_rec: the record on the left.
1594  * left_child_el: the child list pointed to by left_rec
1595  * right_rec: the record to the right of left_rec
1596  * right_child_el: the child list pointed to by right_rec
1597  *
1598  * By definition, this only works on interior nodes.
1599  */
1600 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1601                                   struct ocfs2_extent_list *left_child_el,
1602                                   struct ocfs2_extent_rec *right_rec,
1603                                   struct ocfs2_extent_list *right_child_el)
1604 {
1605         u32 left_clusters, right_end;
1606
1607         /*
1608          * Interior nodes never have holes. Their cpos is the cpos of
1609          * the leftmost record in their child list. Their cluster
1610          * count covers the full theoretical range of their child list
1611          * - the range between their cpos and the cpos of the record
1612          * immediately to their right.
1613          */
1614         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1615         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1616                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1617                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1618         }
1619         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1620         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1621
1622         /*
1623          * Calculate the rightmost cluster boundary before moving
1624          * cpos - we will need to adjust e_int_clusters after
1625          * updating e_cpos so that the right edge stays the same.
1626          */
1627         right_end = le32_to_cpu(right_rec->e_cpos);
1628         right_end += le32_to_cpu(right_rec->e_int_clusters);
1629
1630         right_rec->e_cpos = left_rec->e_cpos;
1631         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1632
1633         right_end -= le32_to_cpu(right_rec->e_cpos);
1634         right_rec->e_int_clusters = cpu_to_le32(right_end);
1635 }
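
/*
 * Editorial worked example for ocfs2_adjust_adjacent_records(): say
 * left_rec starts at cpos 100, right_rec starts at cpos 200 with an
 * e_int_clusters of 50 (so the old right edge is cluster 250), and
 * the first nonempty record of the right child list now begins at
 * cpos 180.  The code above then computes:
 *
 *      left_clusters             = 180 - 100 = 80
 *      right_rec->e_cpos         = 100 + 80  = 180
 *      right_rec->e_int_clusters = 250 - 180 = 70
 *
 * The combined range [100, 250) is unchanged; only the boundary
 * between the two interior records has moved.
 */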
1636
1637 /*
1638  * Adjust the adjacent root node records involved in a
1639  * rotation. left_el_blkno is passed in as a key so that we can easily
1640  * find it's index in the root list.
1641  * find its index in the root list.
1642 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1643                                       struct ocfs2_extent_list *left_el,
1644                                       struct ocfs2_extent_list *right_el,
1645                                       u64 left_el_blkno)
1646 {
1647         int i;
1648
1649         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1650                le16_to_cpu(left_el->l_tree_depth));
1651
1652         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1653                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1654                         break;
1655         }
1656
1657         /*
1658  * The path walking code should never have returned a root and
1659          * two paths which are not adjacent.
1660          */
1661         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1662
1663         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1664                                       &root_el->l_recs[i + 1], right_el);
1665 }
1666
1667 /*
1668  * We've changed a leaf block (in right_path) and need to reflect that
1669  * change back up the subtree.
1670  *
1671  * This happens in multiple places:
1672  *   - When we've moved an extent record from the left path leaf to the right
1673  *     path leaf to make room for an empty extent in the left path leaf.
1674  *   - When our insert into the right path leaf is at the leftmost edge
1675  *     and requires an update of the path immediately to its left. This
1676  *     can occur at the end of some types of rotation and appending inserts.
1677  *   - When we've adjusted the last extent record in the left path leaf and the
1678  *     1st extent record in the right path leaf during cross extent block merge.
1679  */
1680 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1681                                        struct ocfs2_path *left_path,
1682                                        struct ocfs2_path *right_path,
1683                                        int subtree_index)
1684 {
1685         int ret, i, idx;
1686         struct ocfs2_extent_list *el, *left_el, *right_el;
1687         struct ocfs2_extent_rec *left_rec, *right_rec;
1688         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1689
1690         /*
1691          * Update the counts and position values within all the
1692          * interior nodes to reflect the leaf rotation we just did.
1693          *
1694          * The root node is handled below the loop.
1695          *
1696          * We begin the loop with right_el and left_el pointing to the
1697          * leaf lists and work our way up.
1698          *
1699          * NOTE: within this loop, left_el and right_el always refer
1700          * to the *child* lists.
1701          */
1702         left_el = path_leaf_el(left_path);
1703         right_el = path_leaf_el(right_path);
1704         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1705                 mlog(0, "Adjust records at index %u\n", i);
1706
1707                 /*
1708                  * One nice property of knowing that all of these
1709                  * nodes are below the root is that we only deal with
1710                  * the leftmost right node record and the rightmost
1711                  * left node record.
1712                  */
1713                 el = left_path->p_node[i].el;
1714                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1715                 left_rec = &el->l_recs[idx];
1716
1717                 el = right_path->p_node[i].el;
1718                 right_rec = &el->l_recs[0];
1719
1720                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1721                                               right_el);
1722
1723                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1724                 if (ret)
1725                         mlog_errno(ret);
1726
1727                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1728                 if (ret)
1729                         mlog_errno(ret);
1730
1731                 /*
1732                  * Set up our list pointers now so that the current
1733                  * parents become children in the next iteration.
1734                  */
1735                 left_el = left_path->p_node[i].el;
1736                 right_el = right_path->p_node[i].el;
1737         }
1738
1739         /*
1740          * At the root node, adjust the two adjacent records which
1741          * begin our path to the leaves.
1742          */
1743
1744         el = left_path->p_node[subtree_index].el;
1745         left_el = left_path->p_node[subtree_index + 1].el;
1746         right_el = right_path->p_node[subtree_index + 1].el;
1747
1748         ocfs2_adjust_root_records(el, left_el, right_el,
1749                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1750
1751         root_bh = left_path->p_node[subtree_index].bh;
1752
1753         ret = ocfs2_journal_dirty(handle, root_bh);
1754         if (ret)
1755                 mlog_errno(ret);
1756 }
1757
1758 static int ocfs2_rotate_subtree_right(struct inode *inode,
1759                                       handle_t *handle,
1760                                       struct ocfs2_path *left_path,
1761                                       struct ocfs2_path *right_path,
1762                                       int subtree_index)
1763 {
1764         int ret, i;
1765         struct buffer_head *right_leaf_bh;
1766         struct buffer_head *left_leaf_bh = NULL;
1767         struct buffer_head *root_bh;
1768         struct ocfs2_extent_list *right_el, *left_el;
1769         struct ocfs2_extent_rec move_rec;
1770
1771         left_leaf_bh = path_leaf_bh(left_path);
1772         left_el = path_leaf_el(left_path);
1773
1774         if (left_el->l_next_free_rec != left_el->l_count) {
1775                 ocfs2_error(inode->i_sb,
1776                             "Inode %llu has non-full interior leaf node %llu "
1777                             "(next free = %u)",
1778                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1779                             (unsigned long long)left_leaf_bh->b_blocknr,
1780                             le16_to_cpu(left_el->l_next_free_rec));
1781                 return -EROFS;
1782         }
1783
1784         /*
1785          * This extent block may already have an empty record, so we
1786          * return early if so.
1787          */
1788         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1789                 return 0;
1790
1791         root_bh = left_path->p_node[subtree_index].bh;
1792         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1793
1794         ret = ocfs2_journal_access(handle, inode, root_bh,
1795                                    OCFS2_JOURNAL_ACCESS_WRITE);
1796         if (ret) {
1797                 mlog_errno(ret);
1798                 goto out;
1799         }
1800
1801         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1802                 ret = ocfs2_journal_access(handle, inode,
1803                                            right_path->p_node[i].bh,
1804                                            OCFS2_JOURNAL_ACCESS_WRITE);
1805                 if (ret) {
1806                         mlog_errno(ret);
1807                         goto out;
1808                 }
1809
1810                 ret = ocfs2_journal_access(handle, inode,
1811                                            left_path->p_node[i].bh,
1812                                            OCFS2_JOURNAL_ACCESS_WRITE);
1813                 if (ret) {
1814                         mlog_errno(ret);
1815                         goto out;
1816                 }
1817         }
1818
1819         right_leaf_bh = path_leaf_bh(right_path);
1820         right_el = path_leaf_el(right_path);
1821
1822         /* This is a code error, not a disk corruption. */
1823         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1824                         "because rightmost leaf block %llu is empty\n",
1825                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1826                         (unsigned long long)right_leaf_bh->b_blocknr);
1827
1828         ocfs2_create_empty_extent(right_el);
1829
1830         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1831         if (ret) {
1832                 mlog_errno(ret);
1833                 goto out;
1834         }
1835
1836         /* Do the copy now. */
1837         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1838         move_rec = left_el->l_recs[i];
1839         right_el->l_recs[0] = move_rec;
1840
1841         /*
1842          * Clear out the record we just copied and shift everything
1843          * over, leaving an empty extent in the left leaf.
1844          *
1845          * We temporarily subtract from next_free_rec so that the
1846          * shift will lose the tail record (which is now defunct).
1847          */
1848         le16_add_cpu(&left_el->l_next_free_rec, -1);
1849         ocfs2_shift_records_right(left_el);
1850         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1851         le16_add_cpu(&left_el->l_next_free_rec, 1);
1852
1853         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1854         if (ret) {
1855                 mlog_errno(ret);
1856                 goto out;
1857         }
1858
1859         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1860                                 subtree_index);
1861
1862 out:
1863         return ret;
1864 }
1865
1866 /*
1867  * Given a full path, determine what cpos value would return us a path
1868  * containing the leaf immediately to the left of the current one.
1869  *
1870  * Will return zero if the path passed in is already the leftmost path.
1871  */
1872 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1873                                          struct ocfs2_path *path, u32 *cpos)
1874 {
1875         int i, j, ret = 0;
1876         u64 blkno;
1877         struct ocfs2_extent_list *el;
1878
1879         BUG_ON(path->p_tree_depth == 0);
1880
1881         *cpos = 0;
1882
1883         blkno = path_leaf_bh(path)->b_blocknr;
1884
1885         /* Start at the tree node just above the leaf and work our way up. */
1886         i = path->p_tree_depth - 1;
1887         while (i >= 0) {
1888                 el = path->p_node[i].el;
1889
1890                 /*
1891                  * Find the extent record just before the one in our
1892                  * path.
1893                  */
1894                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1895                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1896                                 if (j == 0) {
1897                                         if (i == 0) {
1898                                                 /*
1899                                                  * We've determined that the
1900                                                  * path specified is already
1901                                                  * the leftmost one - return a
1902                                                  * cpos of zero.
1903                                                  */
1904                                                 goto out;
1905                                         }
1906                                         /*
1907                                          * The leftmost record points to our
1908                                          * leaf - we need to travel up the
1909                                          * tree one level.
1910                                          */
1911                                         goto next_node;
1912                                 }
1913
1914                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1915                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1916                                                            &el->l_recs[j - 1]);
1917                                 *cpos = *cpos - 1;
1918                                 goto out;
1919                         }
1920                 }
1921
1922                 /*
1923                  * If we got here, we never found a valid node where
1924                  * the tree indicated one should be.
1925                  */
1926                 ocfs2_error(sb,
1927                             "Invalid extent tree at extent block %llu\n",
1928                             (unsigned long long)blkno);
1929                 ret = -EROFS;
1930                 goto out;
1931
1932 next_node:
1933                 blkno = path->p_node[i].bh->b_blocknr;
1934                 i--;
1935         }
1936
1937 out:
1938         return ret;
1939 }
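
/*
 * Editorial example: if our leaf is referenced by record j, and record
 * j - 1 starts at cpos 64 and covers 8 clusters, then
 * ocfs2_find_cpos_for_left_leaf() returns 64 + 8 - 1 = 71 - the last
 * cluster in the range covered by the record to the left - so a later
 * ocfs2_find_path() call with that cpos descends into the leaf
 * immediately to the left.
 */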
1940
1941 /*
1942  * Extend the transaction by enough credits to complete the rotation,
1943  * and still leave at least the original number of credits allocated
1944  * to this transaction.
1945  */
1946 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1947                                            int op_credits,
1948                                            struct ocfs2_path *path)
1949 {
1950         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1951
1952         if (handle->h_buffer_credits < credits)
1953                 return ocfs2_extend_trans(handle, credits);
1954
1955         return 0;
1956 }
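
/*
 * Editorial example of the credit calculation above: with a tree
 * depth of 4, a subtree root at index 1 and op_credits of 0, a
 * subtree rotation dirties up to (4 - 1) * 2 + 1 = 7 buffers - two
 * per level below the subtree root (one on each path) plus the
 * subtree root itself - so the handle is only extended if fewer than
 * 7 credits remain.
 */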
1957
1958 /*
1959  * Trap the case where we're inserting into the theoretical range past
1960  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1961  * whose cpos is less than ours into the right leaf.
1962  *
1963  * It's only necessary to look at the rightmost record of the left
1964  * leaf because the logic that calls us should ensure that the
1965  * theoretical ranges in the path components above the leaves are
1966  * correct.
1967  */
1968 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1969                                                  u32 insert_cpos)
1970 {
1971         struct ocfs2_extent_list *left_el;
1972         struct ocfs2_extent_rec *rec;
1973         int next_free;
1974
1975         left_el = path_leaf_el(left_path);
1976         next_free = le16_to_cpu(left_el->l_next_free_rec);
1977         rec = &left_el->l_recs[next_free - 1];
1978
1979         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1980                 return 1;
1981         return 0;
1982 }
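
/*
 * Editorial example: if the last record in the left leaf starts at
 * cpos 96 and insert_cpos is 120, the insert falls past that record's
 * start, so this returns 1 and the right-rotation code stops rotating
 * and hands left_path back for the post-insert update.
 */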
1983
1984 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1985 {
1986         int next_free = le16_to_cpu(el->l_next_free_rec);
1987         unsigned int range;
1988         struct ocfs2_extent_rec *rec;
1989
1990         if (next_free == 0)
1991                 return 0;
1992
1993         rec = &el->l_recs[0];
1994         if (ocfs2_is_empty_extent(rec)) {
1995                 /* Empty list. */
1996                 if (next_free == 1)
1997                         return 0;
1998                 rec = &el->l_recs[1];
1999         }
2000
2001         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2002         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2003                 return 1;
2004         return 0;
2005 }
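
/*
 * Editorial example: if the first nonempty record of the leaf covers
 * clusters [64, 72), ocfs2_leftmost_rec_contains() returns 1 for any
 * cpos from 64 through 71 and 0 for cpos 72 or beyond.  It also
 * returns 0 when the list holds nothing but an empty extent.
 */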
2006
2007 /*
2008  * Rotate all the records in a btree right one record, starting at insert_cpos.
2009  *
2010  * The path to the rightmost leaf should be passed in.
2011  *
2012  * The array is assumed to be large enough to hold an entire path (tree depth).
2013  *
2014  * Upon successful return from this function:
2015  *
2016  * - The 'right_path' array will contain a path to the leaf block
2017  *   whose range contains insert_cpos.
2018  * - That leaf block will have a single empty extent in list index 0.
2019  * - In the case that the rotation requires a post-insert update,
2020  *   *ret_left_path will contain a valid path which can be passed to
2021  *   ocfs2_insert_path().
2022  */
2023 static int ocfs2_rotate_tree_right(struct inode *inode,
2024                                    handle_t *handle,
2025                                    enum ocfs2_split_type split,
2026                                    u32 insert_cpos,
2027                                    struct ocfs2_path *right_path,
2028                                    struct ocfs2_path **ret_left_path)
2029 {
2030         int ret, start, orig_credits = handle->h_buffer_credits;
2031         u32 cpos;
2032         struct ocfs2_path *left_path = NULL;
2033
2034         *ret_left_path = NULL;
2035
2036         left_path = ocfs2_new_path(path_root_bh(right_path),
2037                                    path_root_el(right_path));
2038         if (!left_path) {
2039                 ret = -ENOMEM;
2040                 mlog_errno(ret);
2041                 goto out;
2042         }
2043
2044         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2045         if (ret) {
2046                 mlog_errno(ret);
2047                 goto out;
2048         }
2049
2050         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2051
2052         /*
2053          * What we want to do here is:
2054          *
2055          * 1) Start with the rightmost path.
2056          *
2057          * 2) Determine a path to the leaf block directly to the left
2058          *    of that leaf.
2059          *
2060          * 3) Determine the 'subtree root' - the lowest level tree node
2061          *    which contains a path to both leaves.
2062          *
2063          * 4) Rotate the subtree.
2064          *
2065          * 5) Find the next subtree by considering the left path to be
2066          *    the new right path.
2067          *
2068          * The check at the top of this while loop also accepts
2069          * insert_cpos == cpos because cpos is only a _theoretical_
2070          * value to get us the left path - insert_cpos might very well
2071          * be filling that hole.
2072          *
2073          * Stop at a cpos of '0' because we either started at the
2074          * leftmost branch (i.e., a tree with one branch and a
2075          * rotation inside of it), or we've gone as far as we can in
2076          * rotating subtrees.
2077          */
2078         while (cpos && insert_cpos <= cpos) {
2079                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2080                      insert_cpos, cpos);
2081
2082                 ret = ocfs2_find_path(inode, left_path, cpos);
2083                 if (ret) {
2084                         mlog_errno(ret);
2085                         goto out;
2086                 }
2087
2088                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2089                                 path_leaf_bh(right_path),
2090                                 "Inode %lu: error during insert of %u "
2091                                 "(left path cpos %u) results in two identical "
2092                                 "paths ending at %llu\n",
2093                                 inode->i_ino, insert_cpos, cpos,
2094                                 (unsigned long long)
2095                                 path_leaf_bh(left_path)->b_blocknr);
2096
2097                 if (split == SPLIT_NONE &&
2098                     ocfs2_rotate_requires_path_adjustment(left_path,
2099                                                           insert_cpos)) {
2100
2101                         /*
2102                          * We've rotated the tree as much as we
2103                          * should. The rest is up to
2104                          * ocfs2_insert_path() to complete, after the
2105                          * record insertion. We indicate this
2106                          * situation by returning the left path.
2107                          *
2108                          * The reason we don't adjust the records here
2109                          * before the record insert is that an error
2110                          * later might break the invariant that a parent
2111                          * record's e_cpos reflects the actual
2112                          * e_cpos of the 1st nonempty record of the
2113                          * child list.
2114                          */
2115                         *ret_left_path = left_path;
2116                         goto out_ret_path;
2117                 }
2118
2119                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2120
2121                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2122                      start,
2123                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2124                      right_path->p_tree_depth);
2125
2126                 ret = ocfs2_extend_rotate_transaction(handle, start,
2127                                                       orig_credits, right_path);
2128                 if (ret) {
2129                         mlog_errno(ret);
2130                         goto out;
2131                 }
2132
2133                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2134                                                  right_path, start);
2135                 if (ret) {
2136                         mlog_errno(ret);
2137                         goto out;
2138                 }
2139
2140                 if (split != SPLIT_NONE &&
2141                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2142                                                 insert_cpos)) {
2143                         /*
2144                          * A rotate moves the rightmost left leaf
2145                          * record over to the leftmost right leaf
2146                          * slot. If we're doing an extent split
2147                          * instead of a real insert, then we have to
2148                          * check that the extent to be split wasn't
2149                          * just moved over. If it was, then we can
2150                          * exit here, passing left_path back -
2151                          * ocfs2_split_extent() is smart enough to
2152                          * search both leaves.
2153                          */
2154                         *ret_left_path = left_path;
2155                         goto out_ret_path;
2156                 }
2157
2158                 /*
2159                  * There is no need to re-read the next right path
2160                  * as we know that it'll be our current left
2161                  * path. Optimize by copying values instead.
2162                  */
2163                 ocfs2_mv_path(right_path, left_path);
2164
2165                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2166                                                     &cpos);
2167                 if (ret) {
2168                         mlog_errno(ret);
2169                         goto out;
2170                 }
2171         }
2172
2173 out:
2174         ocfs2_free_path(left_path);
2175
2176 out_ret_path:
2177         return ret;
2178 }
2179
2180 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2181                                       struct ocfs2_path *path)
2182 {
2183         int i, idx;
2184         struct ocfs2_extent_rec *rec;
2185         struct ocfs2_extent_list *el;
2186         struct ocfs2_extent_block *eb;
2187         u32 range;
2188
2189         /* Path should always be rightmost. */
2190         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2191         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2192
2193         el = &eb->h_list;
2194         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2195         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2196         rec = &el->l_recs[idx];
2197         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2198
2199         for (i = 0; i < path->p_tree_depth; i++) {
2200                 el = path->p_node[i].el;
2201                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2202                 rec = &el->l_recs[idx];
2203
2204                 rec->e_int_clusters = cpu_to_le32(range);
2205                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2206
2207                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2208         }
2209 }
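
/*
 * Editorial example: if the rightmost leaf's last record ends at
 * cluster 300 and an interior node's last record starts at cpos 256,
 * ocfs2_update_edge_lengths() sets that record's e_int_clusters to
 * 300 - 256 = 44, so every level of the rightmost path once again
 * ends on the same cluster boundary.
 */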
2210
2211 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2212                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2213                               struct ocfs2_path *path, int unlink_start)
2214 {
2215         int ret, i;
2216         struct ocfs2_extent_block *eb;
2217         struct ocfs2_extent_list *el;
2218         struct buffer_head *bh;
2219
2220         for(i = unlink_start; i < path_num_items(path); i++) {
2221                 bh = path->p_node[i].bh;
2222
2223                 eb = (struct ocfs2_extent_block *)bh->b_data;
2224                 /*
2225                  * Some nodes might not have had their final count
2226                  * decremented by the caller - handle that here.
2227                  */
2228                 el = &eb->h_list;
2229                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2230                         mlog(ML_ERROR,
2231                              "Inode %llu, attempted to remove extent block "
2232                              "%llu with %u records\n",
2233                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2234                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2235                              le16_to_cpu(el->l_next_free_rec));
2236
2237                         ocfs2_journal_dirty(handle, bh);
2238                         ocfs2_remove_from_cache(inode, bh);
2239                         continue;
2240                 }
2241
2242                 el->l_next_free_rec = 0;
2243                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2244
2245                 ocfs2_journal_dirty(handle, bh);
2246
2247                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2248                 if (ret)
2249                         mlog_errno(ret);
2250
2251                 ocfs2_remove_from_cache(inode, bh);
2252         }
2253 }
2254
2255 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2256                                  struct ocfs2_path *left_path,
2257                                  struct ocfs2_path *right_path,
2258                                  int subtree_index,
2259                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2260 {
2261         int i;
2262         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2263         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2264         struct ocfs2_extent_list *el;
2265         struct ocfs2_extent_block *eb;
2266
2267         el = path_leaf_el(left_path);
2268
2269         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2270
2271         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2272                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2273                         break;
2274
2275         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2276
2277         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2278         le16_add_cpu(&root_el->l_next_free_rec, -1);
2279
2280         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2281         eb->h_next_leaf_blk = 0;
2282
2283         ocfs2_journal_dirty(handle, root_bh);
2284         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2285
2286         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2287                           subtree_index + 1);
2288 }
2289
2290 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2291                                      struct ocfs2_path *left_path,
2292                                      struct ocfs2_path *right_path,
2293                                      int subtree_index,
2294                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2295                                      int *deleted,
2296                                      struct ocfs2_extent_tree *et)
2297 {
2298         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2299         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2300         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2301         struct ocfs2_extent_block *eb;
2302
2303         *deleted = 0;
2304
2305         right_leaf_el = path_leaf_el(right_path);
2306         left_leaf_el = path_leaf_el(left_path);
2307         root_bh = left_path->p_node[subtree_index].bh;
2308         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2309
2310         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2311                 return 0;
2312
2313         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2314         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2315                 /*
2316                  * It's legal for us to proceed if the right leaf is
2317                  * the rightmost one and it has an empty extent. There
2318                  * are two cases to handle - whether the leaf will be
2319                  * empty after removal or not. If the leaf isn't empty
2320                  * then just remove the empty extent up front. The
2321                  * next block will handle empty leaves by flagging
2322                  * them for unlink.
2323                  *
2324                  * Non-rightmost leaves will return -EAGAIN and the
2325                  * caller can manually move the subtree and retry.
2326                  */
2327
2328                 if (eb->h_next_leaf_blk != 0ULL)
2329                         return -EAGAIN;
2330
2331                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2332                         ret = ocfs2_journal_access(handle, inode,
2333                                                    path_leaf_bh(right_path),
2334                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2335                         if (ret) {
2336                                 mlog_errno(ret);
2337                                 goto out;
2338                         }
2339
2340                         ocfs2_remove_empty_extent(right_leaf_el);
2341                 } else
2342                         right_has_empty = 1;
2343         }
2344
2345         if (eb->h_next_leaf_blk == 0ULL &&
2346             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2347                 /*
2348                  * We have to update i_last_eb_blk during the
2349                  * metadata delete.
2350                  */
2351                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2352                                            OCFS2_JOURNAL_ACCESS_WRITE);
2353                 if (ret) {
2354                         mlog_errno(ret);
2355                         goto out;
2356                 }
2357
2358                 del_right_subtree = 1;
2359         }
2360
2361         /*
2362          * Getting here with an empty extent in the right path implies
2363          * that it's the rightmost path and will be deleted.
2364          */
2365         BUG_ON(right_has_empty && !del_right_subtree);
2366
2367         ret = ocfs2_journal_access(handle, inode, root_bh,
2368                                    OCFS2_JOURNAL_ACCESS_WRITE);
2369         if (ret) {
2370                 mlog_errno(ret);
2371                 goto out;
2372         }
2373
2374         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2375                 ret = ocfs2_journal_access(handle, inode,
2376                                            right_path->p_node[i].bh,
2377                                            OCFS2_JOURNAL_ACCESS_WRITE);
2378                 if (ret) {
2379                         mlog_errno(ret);
2380                         goto out;
2381                 }
2382
2383                 ret = ocfs2_journal_access(handle, inode,
2384                                            left_path->p_node[i].bh,
2385                                            OCFS2_JOURNAL_ACCESS_WRITE);
2386                 if (ret) {
2387                         mlog_errno(ret);
2388                         goto out;
2389                 }
2390         }
2391
2392         if (!right_has_empty) {
2393                 /*
2394                  * Only do this if we're moving a real
2395                  * record. Otherwise, the action is delayed until
2396                  * after removal of the right path, in which case we
2397                  * can do a simple shift to remove the empty extent.
2398                  */
2399                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2400                 memset(&right_leaf_el->l_recs[0], 0,
2401                        sizeof(struct ocfs2_extent_rec));
2402         }
2403         if (eb->h_next_leaf_blk == 0ULL) {
2404                 /*
2405                  * Move recs over to get rid of empty extent, decrease
2406                  * next_free. This is allowed to remove the last
2407                  * extent in our leaf (setting l_next_free_rec to
2408                  * zero) - the delete code below won't care.
2409                  */
2410                 ocfs2_remove_empty_extent(right_leaf_el);
2411         }
2412
2413         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2414         if (ret)
2415                 mlog_errno(ret);
2416         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2417         if (ret)
2418                 mlog_errno(ret);
2419
2420         if (del_right_subtree) {
2421                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2422                                      subtree_index, dealloc);
2423                 ocfs2_update_edge_lengths(inode, handle, left_path);
2424
2425                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2426                 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2427
2428                 /*
2429                  * Removal of the extent in the left leaf was skipped
2430                  * above so we could delete the right path
2431                  * 1st.
2432                  */
2433                 if (right_has_empty)
2434                         ocfs2_remove_empty_extent(left_leaf_el);
2435
2436                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2437                 if (ret)
2438                         mlog_errno(ret);
2439
2440                 *deleted = 1;
2441         } else
2442                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2443                                            subtree_index);
2444
2445 out:
2446         return ret;
2447 }
2448
2449 /*
2450  * Given a full path, determine what cpos value would return us a path
2451  * containing the leaf immediately to the right of the current one.
2452  *
2453  * Will return zero if the path passed in is already the rightmost path.
2454  *
2455  * This looks similar to, but is subtly different from,
2456  * ocfs2_find_cpos_for_left_leaf().
2457  */
2458 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2459                                           struct ocfs2_path *path, u32 *cpos)
2460 {
2461         int i, j, ret = 0;
2462         u64 blkno;
2463         struct ocfs2_extent_list *el;
2464
2465         *cpos = 0;
2466
2467         if (path->p_tree_depth == 0)
2468                 return 0;
2469
2470         blkno = path_leaf_bh(path)->b_blocknr;
2471
2472         /* Start at the tree node just above the leaf and work our way up. */
2473         i = path->p_tree_depth - 1;
2474         while (i >= 0) {
2475                 int next_free;
2476
2477                 el = path->p_node[i].el;
2478
2479                 /*
2480                  * Find the extent record just after the one in our
2481                  * path.
2482                  */
2483                 next_free = le16_to_cpu(el->l_next_free_rec);
2484                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2485                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2486                                 if (j == (next_free - 1)) {
2487                                         if (i == 0) {
2488                                                 /*
2489                                                  * We've determined that the
2490                                                  * path specified is already
2491                                                  * the rightmost one - return a
2492                                                  * cpos of zero.
2493                                                  */
2494                                                 goto out;
2495                                         }
2496                                         /*
2497                                          * The rightmost record points to our
2498                                          * leaf - we need to travel up the
2499                                          * tree one level.
2500                                          */
2501                                         goto next_node;
2502                                 }
2503
2504                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2505                                 goto out;
2506                         }
2507                 }
2508
2509                 /*
2510                  * If we got here, we never found a valid node where
2511                  * the tree indicated one should be.
2512                  */
2513                 ocfs2_error(sb,
2514                             "Invalid extent tree at extent block %llu\n",
2515                             (unsigned long long)blkno);
2516                 ret = -EROFS;
2517                 goto out;
2518
2519 next_node:
2520                 blkno = path->p_node[i].bh->b_blocknr;
2521                 i--;
2522         }
2523
2524 out:
2525         return ret;
2526 }
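
/*
 * Editorial example: if our leaf is referenced by record j and record
 * j + 1 starts at cpos 128, ocfs2_find_cpos_for_right_leaf() returns
 * 128 - the first cluster of the leaf to the right - in contrast to
 * the left-leaf helper above, which returns the last cluster covered
 * on the left.
 */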
2527
2528 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2529                                             handle_t *handle,
2530                                             struct buffer_head *bh,
2531                                             struct ocfs2_extent_list *el)
2532 {
2533         int ret;
2534
2535         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2536                 return 0;
2537
2538         ret = ocfs2_journal_access(handle, inode, bh,
2539                                    OCFS2_JOURNAL_ACCESS_WRITE);
2540         if (ret) {
2541                 mlog_errno(ret);
2542                 goto out;
2543         }
2544
2545         ocfs2_remove_empty_extent(el);
2546
2547         ret = ocfs2_journal_dirty(handle, bh);
2548         if (ret)
2549                 mlog_errno(ret);
2550
2551 out:
2552         return ret;
2553 }
2554
2555 static int __ocfs2_rotate_tree_left(struct inode *inode,
2556                                     handle_t *handle, int orig_credits,
2557                                     struct ocfs2_path *path,
2558                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2559                                     struct ocfs2_path **empty_extent_path,
2560                                     struct ocfs2_extent_tree *et)
2561 {
2562         int ret, subtree_root, deleted;
2563         u32 right_cpos;
2564         struct ocfs2_path *left_path = NULL;
2565         struct ocfs2_path *right_path = NULL;
2566
2567         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2568
2569         *empty_extent_path = NULL;
2570
2571         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2572                                              &right_cpos);
2573         if (ret) {
2574                 mlog_errno(ret);
2575                 goto out;
2576         }
2577
2578         left_path = ocfs2_new_path(path_root_bh(path),
2579                                    path_root_el(path));
2580         if (!left_path) {
2581                 ret = -ENOMEM;
2582                 mlog_errno(ret);
2583                 goto out;
2584         }
2585
2586         ocfs2_cp_path(left_path, path);
2587
2588         right_path = ocfs2_new_path(path_root_bh(path),
2589                                     path_root_el(path));
2590         if (!right_path) {
2591                 ret = -ENOMEM;
2592                 mlog_errno(ret);
2593                 goto out;
2594         }
2595
2596         while (right_cpos) {
2597                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2598                 if (ret) {
2599                         mlog_errno(ret);
2600                         goto out;
2601                 }
2602
2603                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2604                                                        right_path);
2605
2606                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2607                      subtree_root,
2608                      (unsigned long long)
2609                      right_path->p_node[subtree_root].bh->b_blocknr,
2610                      right_path->p_tree_depth);
2611
2612                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2613                                                       orig_credits, left_path);
2614                 if (ret) {
2615                         mlog_errno(ret);
2616                         goto out;
2617                 }
2618
2619                 /*
2620                  * Caller might still want to make changes to the
2621                  * tree root, so re-add it to the journal here.
2622                  */
2623                 ret = ocfs2_journal_access(handle, inode,
2624                                            path_root_bh(left_path),
2625                                            OCFS2_JOURNAL_ACCESS_WRITE);
2626                 if (ret) {
2627                         mlog_errno(ret);
2628                         goto out;
2629                 }
2630
2631                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2632                                                 right_path, subtree_root,
2633                                                 dealloc, &deleted, et);
2634                 if (ret == -EAGAIN) {
2635                         /*
2636                          * The rotation has to temporarily stop due to
2637                          * the right subtree having an empty
2638                          * extent. Pass it back to the caller for a
2639                          * fixup.
2640                          */
2641                         *empty_extent_path = right_path;
2642                         right_path = NULL;
2643                         goto out;
2644                 }
2645                 if (ret) {
2646                         mlog_errno(ret);
2647                         goto out;
2648                 }
2649
2650                 /*
2651                  * The subtree rotate might have removed records on
2652                  * the rightmost edge. If so, then rotation is
2653                  * complete.
2654                  */
2655                 if (deleted)
2656                         break;
2657
2658                 ocfs2_mv_path(left_path, right_path);
2659
2660                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2661                                                      &right_cpos);
2662                 if (ret) {
2663                         mlog_errno(ret);
2664                         goto out;
2665                 }
2666         }
2667
2668 out:
2669         ocfs2_free_path(right_path);
2670         ocfs2_free_path(left_path);
2671
2672         return ret;
2673 }
2674
2675 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2676                                 struct ocfs2_path *path,
2677                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2678                                 struct ocfs2_extent_tree *et)
2679 {
2680         int ret, subtree_index;
2681         u32 cpos;
2682         struct ocfs2_path *left_path = NULL;
2683         struct ocfs2_extent_block *eb;
2684         struct ocfs2_extent_list *el;
2685
2686
2687         ret = et->eops->sanity_check(inode, et);
2688         if (ret)
2689                 goto out;
2690         /*
2691          * There are two ways we handle this, depending on
2692          * whether path is the only existing one.
2693          */
2694         ret = ocfs2_extend_rotate_transaction(handle, 0,
2695                                               handle->h_buffer_credits,
2696                                               path);
2697         if (ret) {
2698                 mlog_errno(ret);
2699                 goto out;
2700         }
2701
2702         ret = ocfs2_journal_access_path(inode, handle, path);
2703         if (ret) {
2704                 mlog_errno(ret);
2705                 goto out;
2706         }
2707
2708         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2709         if (ret) {
2710                 mlog_errno(ret);
2711                 goto out;
2712         }
2713
2714         if (cpos) {
2715                 /*
2716                  * We have a path to the left of this one - it needs
2717                  * an update too.
2718                  */
2719                 left_path = ocfs2_new_path(path_root_bh(path),
2720                                            path_root_el(path));
2721                 if (!left_path) {
2722                         ret = -ENOMEM;
2723                         mlog_errno(ret);
2724                         goto out;
2725                 }
2726
2727                 ret = ocfs2_find_path(inode, left_path, cpos);
2728                 if (ret) {
2729                         mlog_errno(ret);
2730                         goto out;
2731                 }
2732
2733                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2734                 if (ret) {
2735                         mlog_errno(ret);
2736                         goto out;
2737                 }
2738
2739                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2740
2741                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2742                                      subtree_index, dealloc);
2743                 ocfs2_update_edge_lengths(inode, handle, left_path);
2744
2745                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2746                 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2747         } else {
2748                 /*
2749                  * 'path' is also the leftmost path which
2750                  * means it must be the only one. This gets
2751                  * handled differently because we want to
2752                  * revert the inode back to having extents
2753                  * in-line.
2754                  */
2755                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2756
2757                 el = et->root_el;
2758                 el->l_tree_depth = 0;
2759                 el->l_next_free_rec = 0;
2760                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2761
2762                 ocfs2_set_last_eb_blk(et, 0);
2763         }
2764
2765         ocfs2_journal_dirty(handle, path_root_bh(path));
2766
2767 out:
2768         ocfs2_free_path(left_path);
2769         return ret;
2770 }
2771
2772 /*
2773  * Left rotation of btree records.
2774  *
2775  * In many ways, this is (unsurprisingly) the opposite of right
2776  * rotation. We start at some non-rightmost path containing an empty
2777  * extent in the leaf block. The code works its way to the rightmost
2778  * path by rotating records to the left in every subtree.
2779  *
2780  * This is used by any code which reduces the number of extent records
2781  * in a leaf. After removal, an empty record should be placed in the
2782  * leftmost list position.
2783  *
2784  * This won't handle a length update of the rightmost path records if
2785  * the rightmost tree leaf record is removed, so the caller is
2786  * responsible for detecting and correcting that.
2787  */
2788 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2789                                   struct ocfs2_path *path,
2790                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2791                                   struct ocfs2_extent_tree *et)
2792 {
2793         int ret, orig_credits = handle->h_buffer_credits;
2794         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2795         struct ocfs2_extent_block *eb;
2796         struct ocfs2_extent_list *el;
2797
2798         el = path_leaf_el(path);
2799         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2800                 return 0;
2801
2802         if (path->p_tree_depth == 0) {
2803 rightmost_no_delete:
2804                 /*
2805                  * Inline extents. This is trivially handled, so do
2806                  * it up front.
2807                  */
2808                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2809                                                        path_leaf_bh(path),
2810                                                        path_leaf_el(path));
2811                 if (ret)
2812                         mlog_errno(ret);
2813                 goto out;
2814         }
2815
2816         /*
2817          * Handle the rightmost branch now. There are several cases:
2818          *  1) simple rotation leaving records in there. That's trivial.
2819          *  2) rotation requiring a branch delete - there's no more
2820          *     records left. Two cases of this:
2821          *     a) There are branches to the left.
2822          *     b) This is also the leftmost (the only) branch.
2823          *
2824          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2825          *  2a) we need the left branch so that we can update it with the unlink
2826          *  2b) we need to bring the inode back to inline extents.
2827          */
2828
2829         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2830         el = &eb->h_list;
2831         if (eb->h_next_leaf_blk == 0) {
2832                 /*
2833                  * This gets a bit tricky if we're going to delete the
2834                  * rightmost path. Get the other cases out of the way
2835                  * 1st.
2836                  */
2837                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2838                         goto rightmost_no_delete;
2839
2840                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2841                         ret = -EIO;
2842                         ocfs2_error(inode->i_sb,
2843                                     "Inode %llu has empty extent block at %llu",
2844                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2845                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2846                         goto out;
2847                 }
2848
2849                 /*
2850                  * XXX: The caller can not trust "path" any more after
2851                  * this as it will have been deleted. What do we do?
2852                  *
2853                  * In theory the rotate-for-merge code will never get
2854                  * here because it'll always ask for a rotate in a
2855                  * nonempty list.
2856                  */
2857
2858                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2859                                                   dealloc, et);
2860                 if (ret)
2861                         mlog_errno(ret);
2862                 goto out;
2863         }
2864
2865         /*
2866          * Now we can loop, remembering the path we get from -EAGAIN
2867          * and restarting from there.
2868          */
2869 try_rotate:
2870         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2871                                        dealloc, &restart_path, et);
2872         if (ret && ret != -EAGAIN) {
2873                 mlog_errno(ret);
2874                 goto out;
2875         }
2876
2877         while (ret == -EAGAIN) {
2878                 tmp_path = restart_path;
2879                 restart_path = NULL;
2880
2881                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2882                                                tmp_path, dealloc,
2883                                                &restart_path, et);
2884                 if (ret && ret != -EAGAIN) {
2885                         mlog_errno(ret);
2886                         goto out;
2887                 }
2888
2889                 ocfs2_free_path(tmp_path);
2890                 tmp_path = NULL;
2891
2892                 if (ret == 0)
2893                         goto try_rotate;
2894         }
2895
2896 out:
2897         ocfs2_free_path(tmp_path);
2898         ocfs2_free_path(restart_path);
2899         return ret;
2900 }
2901
2902 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2903                                 int index)
2904 {
2905         struct ocfs2_extent_rec *rec = &el->l_recs[index];
2906         unsigned int size;
2907
2908         if (rec->e_leaf_clusters == 0) {
2909                 /*
2910                  * We consumed all of the merged-from record. An empty
2911                  * extent cannot exist anywhere but the 1st array
2912                  * position, so move things over if the merged-from
2913                  * record doesn't occupy that position.
2914                  *
2915                  * This creates a new empty extent so the caller
2916                  * should be smart enough to have removed any existing
2917                  * ones.
2918                  */
2919                 if (index > 0) {
2920                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2921                         size = index * sizeof(struct ocfs2_extent_rec);
2922                         memmove(&el->l_recs[1], &el->l_recs[0], size);
2923                 }
2924
2925                 /*
2926                  * Always memset - the caller doesn't check whether it
2927                  * created an empty extent, so there could be junk in
2928                  * the other fields.
2929                  */
2930                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2931         }
2932 }
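
/*
 * Worked example for ocfs2_cleanup_merge() above (illustrative numbers,
 * not from the source): suppose a leaf holds the (cpos, clusters) pairs
 * { [0]=(100,8), [1]=(108,4), [2]=(112,6) } and a merge consumed all of
 * record 1, so index == 1 and its cluster count is now 0. The memmove
 * copies record 0 over record 1, giving { (100,8), (100,8), (112,6) },
 * and the memset then clears slot 0, leaving { (0,0), (100,8), (112,6) }:
 * the single empty record sits in slot 0 as required, ready to be
 * rotated away by the caller.
 */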
2933
2934 static int ocfs2_get_right_path(struct inode *inode,
2935                                 struct ocfs2_path *left_path,
2936                                 struct ocfs2_path **ret_right_path)
2937 {
2938         int ret;
2939         u32 right_cpos;
2940         struct ocfs2_path *right_path = NULL;
2941         struct ocfs2_extent_list *left_el;
2942
2943         *ret_right_path = NULL;
2944
2945         /* This function shouldn't be called for non-trees. */
2946         BUG_ON(left_path->p_tree_depth == 0);
2947
2948         left_el = path_leaf_el(left_path);
2949         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2950
2951         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2952                                              &right_cpos);
2953         if (ret) {
2954                 mlog_errno(ret);
2955                 goto out;
2956         }
2957
2958         /* This function shouldn't be called for the rightmost leaf. */
2959         BUG_ON(right_cpos == 0);
2960
2961         right_path = ocfs2_new_path(path_root_bh(left_path),
2962                                     path_root_el(left_path));
2963         if (!right_path) {
2964                 ret = -ENOMEM;
2965                 mlog_errno(ret);
2966                 goto out;
2967         }
2968
2969         ret = ocfs2_find_path(inode, right_path, right_cpos);
2970         if (ret) {
2971                 mlog_errno(ret);
2972                 goto out;
2973         }
2974
2975         *ret_right_path = right_path;
2976 out:
2977         if (ret)
2978                 ocfs2_free_path(right_path);
2979         return ret;
2980 }
2981
2982 /*
2983  * Remove split_rec clusters from the record at index and merge them
2984  * onto the beginning of the record "next" to it.
2985  * For index < l_count - 1, the next means the extent rec at index + 1.
2986  * For index == l_count - 1, the "next" means the 1st extent rec of the
2987  * next extent block.
2988  */
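/*
 * Worked example (illustrative numbers, not from the source): say
 * left_rec covers (e_cpos=100, e_leaf_clusters=8) and right_rec starts
 * immediately after it at (e_cpos=108, e_leaf_clusters=4). If split_rec
 * names the last 3 clusters of left_rec (e_cpos=105, e_leaf_clusters=3),
 * the merge below leaves left_rec as (100, 5) and right_rec as (105, 7),
 * with right_rec->e_blkno pulled back by ocfs2_clusters_to_blocks(sb, 3)
 * so it still points at the first block of the now-larger extent. The
 * merge is only attempted when the two regions are physically
 * contiguous, which is what makes this arithmetic valid.
 */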
2989 static int ocfs2_merge_rec_right(struct inode *inode,
2990                                  struct ocfs2_path *left_path,
2991                                  handle_t *handle,
2992                                  struct ocfs2_extent_rec *split_rec,
2993                                  int index)
2994 {
2995         int ret, next_free, i;
2996         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2997         struct ocfs2_extent_rec *left_rec;
2998         struct ocfs2_extent_rec *right_rec;
2999         struct ocfs2_extent_list *right_el;
3000         struct ocfs2_path *right_path = NULL;
3001         int subtree_index = 0;
3002         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3003         struct buffer_head *bh = path_leaf_bh(left_path);
3004         struct buffer_head *root_bh = NULL;
3005
3006         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3007         left_rec = &el->l_recs[index];
3008
3009         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3010             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3011                 /* This merge crosses into the next extent block. */
3012                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3013                 if (ret) {
3014                         mlog_errno(ret);
3015                         goto out;
3016                 }
3017
3018                 right_el = path_leaf_el(right_path);
3019                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3020                 BUG_ON(next_free <= 0);
3021                 right_rec = &right_el->l_recs[0];
3022                 if (ocfs2_is_empty_extent(right_rec)) {
3023                         BUG_ON(next_free <= 1);
3024                         right_rec = &right_el->l_recs[1];
3025                 }
3026
3027                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3028                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3029                        le32_to_cpu(right_rec->e_cpos));
3030
3031                 subtree_index = ocfs2_find_subtree_root(inode,
3032                                                         left_path, right_path);
3033
3034                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3035                                                       handle->h_buffer_credits,
3036                                                       right_path);
3037                 if (ret) {
3038                         mlog_errno(ret);
3039                         goto out;
3040                 }
3041
3042                 root_bh = left_path->p_node[subtree_index].bh;
3043                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3044
3045                 ret = ocfs2_journal_access(handle, inode, root_bh,
3046                                            OCFS2_JOURNAL_ACCESS_WRITE);
3047                 if (ret) {
3048                         mlog_errno(ret);
3049                         goto out;
3050                 }
3051
3052                 for (i = subtree_index + 1;
3053                      i < path_num_items(right_path); i++) {
3054                         ret = ocfs2_journal_access(handle, inode,
3055                                                    right_path->p_node[i].bh,
3056                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3057                         if (ret) {
3058                                 mlog_errno(ret);
3059                                 goto out;
3060                         }
3061
3062                         ret = ocfs2_journal_access(handle, inode,
3063                                                    left_path->p_node[i].bh,
3064                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3065                         if (ret) {
3066                                 mlog_errno(ret);
3067                                 goto out;
3068                         }
3069                 }
3070
3071         } else {
3072                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3073                 right_rec = &el->l_recs[index + 1];
3074         }
3075
3076         ret = ocfs2_journal_access(handle, inode, bh,
3077                                    OCFS2_JOURNAL_ACCESS_WRITE);
3078         if (ret) {
3079                 mlog_errno(ret);
3080                 goto out;
3081         }
3082
3083         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3084
3085         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3086         le64_add_cpu(&right_rec->e_blkno,
3087                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3088         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3089
3090         ocfs2_cleanup_merge(el, index);
3091
3092         ret = ocfs2_journal_dirty(handle, bh);
3093         if (ret)
3094                 mlog_errno(ret);
3095
3096         if (right_path) {
3097                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3098                 if (ret)
3099                         mlog_errno(ret);
3100
3101                 ocfs2_complete_edge_insert(inode, handle, left_path,
3102                                            right_path, subtree_index);
3103         }
3104 out:
3105         if (right_path)
3106                 ocfs2_free_path(right_path);
3107         return ret;
3108 }
3109
3110 static int ocfs2_get_left_path(struct inode *inode,
3111                                struct ocfs2_path *right_path,
3112                                struct ocfs2_path **ret_left_path)
3113 {
3114         int ret;
3115         u32 left_cpos;
3116         struct ocfs2_path *left_path = NULL;
3117
3118         *ret_left_path = NULL;
3119
3120         /* This function shouldn't be called for non-trees. */
3121         BUG_ON(right_path->p_tree_depth == 0);
3122
3123         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3124                                             right_path, &left_cpos);
3125         if (ret) {
3126                 mlog_errno(ret);
3127                 goto out;
3128         }
3129
3130         /* This function shouldn't be called for the leftmost leaf. */
3131         BUG_ON(left_cpos == 0);
3132
3133         left_path = ocfs2_new_path(path_root_bh(right_path),
3134                                    path_root_el(right_path));
3135         if (!left_path) {
3136                 ret = -ENOMEM;
3137                 mlog_errno(ret);
3138                 goto out;
3139         }
3140
3141         ret = ocfs2_find_path(inode, left_path, left_cpos);
3142         if (ret) {
3143                 mlog_errno(ret);
3144                 goto out;
3145         }
3146
3147         *ret_left_path = left_path;
3148 out:
3149         if (ret)
3150                 ocfs2_free_path(left_path);
3151         return ret;
3152 }
3153
3154 /*
3155  * Remove split_rec clusters from the record at index and merge them
3156  * onto the tail of the record "before" it.
3157  * For index > 0, the "before" means the extent rec at index - 1.
3158  *
3159  * For index == 0, the "before" means the last record of the previous
3160  * extent block. There is also a situation where we may need to
3161  * remove the rightmost leaf extent block from the right_path and change
3162  * the right path to indicate the new rightmost path.
3163  */
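/*
 * Worked example (illustrative numbers, not from the source): with
 * left_rec = (e_cpos=100, e_leaf_clusters=5) and right_rec =
 * (e_cpos=105, e_leaf_clusters=7), a split_rec naming the first 2
 * clusters of right_rec (e_cpos=105, e_leaf_clusters=2) leaves left_rec
 * as (100, 7) and right_rec as (107, 5), with right_rec->e_blkno moved
 * forward by ocfs2_clusters_to_blocks(sb, 2). If right_rec is consumed
 * entirely and it was the only record in the rightmost leaf, that leaf
 * is removed via ocfs2_remove_rightmost_path() as described above.
 */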
3164 static int ocfs2_merge_rec_left(struct inode *inode,
3165                                 struct ocfs2_path *right_path,
3166                                 handle_t *handle,
3167                                 struct ocfs2_extent_rec *split_rec,
3168                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3169                                 struct ocfs2_extent_tree *et,
3170                                 int index)
3171 {
3172         int ret, i, subtree_index = 0, has_empty_extent = 0;
3173         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3174         struct ocfs2_extent_rec *left_rec;
3175         struct ocfs2_extent_rec *right_rec;
3176         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3177         struct buffer_head *bh = path_leaf_bh(right_path);
3178         struct buffer_head *root_bh = NULL;
3179         struct ocfs2_path *left_path = NULL;
3180         struct ocfs2_extent_list *left_el;
3181
3182         BUG_ON(index < 0);
3183
3184         right_rec = &el->l_recs[index];
3185         if (index == 0) {
3186                 /* This merge crosses into the previous extent block. */
3187                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3188                 if (ret) {
3189                         mlog_errno(ret);
3190                         goto out;
3191                 }
3192
3193                 left_el = path_leaf_el(left_path);
3194                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3195                        le16_to_cpu(left_el->l_count));
3196
3197                 left_rec = &left_el->l_recs[
3198                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3199                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3200                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3201                        le32_to_cpu(split_rec->e_cpos));
3202
3203                 subtree_index = ocfs2_find_subtree_root(inode,
3204                                                         left_path, right_path);
3205
3206                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3207                                                       handle->h_buffer_credits,
3208                                                       left_path);
3209                 if (ret) {
3210                         mlog_errno(ret);
3211                         goto out;
3212                 }
3213
3214                 root_bh = left_path->p_node[subtree_index].bh;
3215                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3216
3217                 ret = ocfs2_journal_access(handle, inode, root_bh,
3218                                            OCFS2_JOURNAL_ACCESS_WRITE);
3219                 if (ret) {
3220                         mlog_errno(ret);
3221                         goto out;
3222                 }
3223
3224                 for (i = subtree_index + 1;
3225                      i < path_num_items(right_path); i++) {
3226                         ret = ocfs2_journal_access(handle, inode,
3227                                                    right_path->p_node[i].bh,
3228                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3229                         if (ret) {
3230                                 mlog_errno(ret);
3231                                 goto out;
3232                         }
3233
3234                         ret = ocfs2_journal_access(handle, inode,
3235                                                    left_path->p_node[i].bh,
3236                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3237                         if (ret) {
3238                                 mlog_errno(ret);
3239                                 goto out;
3240                         }
3241                 }
3242         } else {
3243                 left_rec = &el->l_recs[index - 1];
3244                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3245                         has_empty_extent = 1;
3246         }
3247
3248         ret = ocfs2_journal_access(handle, inode, bh,
3249                                    OCFS2_JOURNAL_ACCESS_WRITE);
3250         if (ret) {
3251                 mlog_errno(ret);
3252                 goto out;
3253         }
3254
3255         if (has_empty_extent && index == 1) {
3256                 /*
3257                  * The easy case - we can just plop the record right in.
3258                  */
3259                 *left_rec = *split_rec;
3260
3261                 has_empty_extent = 0;
3262         } else
3263                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3264
3265         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3266         le64_add_cpu(&right_rec->e_blkno,
3267                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3268         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3269
3270         ocfs2_cleanup_merge(el, index);
3271
3272         ret = ocfs2_journal_dirty(handle, bh);
3273         if (ret)
3274                 mlog_errno(ret);
3275
3276         if (left_path) {
3277                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3278                 if (ret)
3279                         mlog_errno(ret);
3280
3281                 /*
3282                  * If the right_rec is now empty and its extent block is
3283                  * empty as well, ocfs2_complete_edge_insert can't handle
3284                  * it and we need to delete the right extent block.
3285                  */
3286                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3287                     le16_to_cpu(el->l_next_free_rec) == 1) {
3288
3289                         ret = ocfs2_remove_rightmost_path(inode, handle,
3290                                                           right_path,
3291                                                           dealloc, et);
3292                         if (ret) {
3293                                 mlog_errno(ret);
3294                                 goto out;
3295                         }
3296
3297                         /* Now the rightmost extent block has been deleted.
3298                          * So we use the new rightmost path.
3299                          */
3300                         ocfs2_mv_path(right_path, left_path);
3301                         left_path = NULL;
3302                 } else
3303                         ocfs2_complete_edge_insert(inode, handle, left_path,
3304                                                    right_path, subtree_index);
3305         }
3306 out:
3307         if (left_path)
3308                 ocfs2_free_path(left_path);
3309         return ret;
3310 }
3311
3312 static int ocfs2_try_to_merge_extent(struct inode *inode,
3313                                      handle_t *handle,
3314                                      struct ocfs2_path *path,
3315                                      int split_index,
3316                                      struct ocfs2_extent_rec *split_rec,
3317                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3318                                      struct ocfs2_merge_ctxt *ctxt,
3319                                      struct ocfs2_extent_tree *et)
3320
3321 {
3322         int ret = 0;
3323         struct ocfs2_extent_list *el = path_leaf_el(path);
3324         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3325
3326         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3327
3328         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3329                 /*
3330                  * The merge code will need to create an empty
3331                  * extent to take the place of the newly
3332                  * emptied slot. Remove any pre-existing empty
3333                  * extents - having more than one in a leaf is
3334                  * illegal.
3335                  */
3336                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3337                                              dealloc, et);
3338                 if (ret) {
3339                         mlog_errno(ret);
3340                         goto out;
3341                 }
3342                 split_index--;
3343                 rec = &el->l_recs[split_index];
3344         }
3345
3346         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3347                 /*
3348                  * Left-right contig implies this.
3349                  */
3350                 BUG_ON(!ctxt->c_split_covers_rec);
3351
3352                 /*
3353                  * Since the leftright insert always covers the entire
3354                  * extent, this call will delete the insert record
3355                  * entirely, resulting in an empty extent record added to
3356                  * the extent block.
3357                  *
3358                  * Since the adding of an empty extent shifts
3359                  * everything back to the right, there's no need to
3360                  * update split_index here.
3361                  *
3362                  * When the split_index is zero, we need to merge it into the
3363                  * previous extent block. It is more efficient and easier
3364                  * if we do merge_right first and merge_left later.
3365                  */
3366                 ret = ocfs2_merge_rec_right(inode, path,
3367                                             handle, split_rec,
3368                                             split_index);
3369                 if (ret) {
3370                         mlog_errno(ret);
3371                         goto out;
3372                 }
3373
3374                 /*
3375                  * This can only fail due to a logic error above.
3376                  */
3377                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3378
3379                 /* The merge left us with an empty extent, remove it. */
3380                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3381                                              dealloc, et);
3382                 if (ret) {
3383                         mlog_errno(ret);
3384                         goto out;
3385                 }
3386
3387                 rec = &el->l_recs[split_index];
3388
3389                 /*
3390                  * Note that we don't pass split_rec here on purpose -
3391                  * we've merged it into the rec already.
3392                  */
3393                 ret = ocfs2_merge_rec_left(inode, path,
3394                                            handle, rec,
3395                                            dealloc, et,
3396                                            split_index);
3397
3398                 if (ret) {
3399                         mlog_errno(ret);
3400                         goto out;
3401                 }
3402
3403                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3404                                              dealloc, et);
3405                 /*
3406                  * Error from this last rotate is not critical, so
3407                  * print but don't bubble it up.
3408                  */
3409                 if (ret)
3410                         mlog_errno(ret);
3411                 ret = 0;
3412         } else {
3413                 /*
3414                  * Merge a record to the left or right.
3415                  *
3416                  * 'contig_type' is relative to the existing record,
3417                  * so for example, if we're "right contig", it's to
3418                  * the record on the left (hence the left merge).
3419                  */
3420                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3421                         ret = ocfs2_merge_rec_left(inode,
3422                                                    path,
3423                                                    handle, split_rec,
3424                                                    dealloc, et,
3425                                                    split_index);
3426                         if (ret) {
3427                                 mlog_errno(ret);
3428                                 goto out;
3429                         }
3430                 } else {
3431                         ret = ocfs2_merge_rec_right(inode,
3432                                                     path,
3433                                                     handle, split_rec,
3434                                                     split_index);
3435                         if (ret) {
3436                                 mlog_errno(ret);
3437                                 goto out;
3438                         }
3439                 }
3440
3441                 if (ctxt->c_split_covers_rec) {
3442                         /*
3443                          * The merge may have left an empty extent in
3444                          * our leaf. Try to rotate it away.
3445                          */
3446                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3447                                                      dealloc, et);
3448                         if (ret)
3449                                 mlog_errno(ret);
3450                         ret = 0;
3451                 }
3452         }
3453
3454 out:
3455         return ret;
3456 }
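
/*
 * Worked example of the CONTIG_LEFTRIGHT sequence above (illustrative
 * numbers, not from the source): with leaf records L=(100,5), R=(105,2),
 * N=(107,6) as (cpos, clusters) pairs and a split_rec equal to R,
 * ocfs2_merge_rec_right() plus its cleanup leave the leaf as
 * { empty, (100,5), (105,8) }; the first rotate-left drops the empty
 * record; ocfs2_merge_rec_left() then folds the combined (105,8) record
 * onto L, leaving { empty, (100,13) }; the final rotate-left cleans
 * that up to a single (100,13) record.
 */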
3457
3458 static void ocfs2_subtract_from_rec(struct super_block *sb,
3459                                     enum ocfs2_split_type split,
3460                                     struct ocfs2_extent_rec *rec,
3461                                     struct ocfs2_extent_rec *split_rec)
3462 {
3463         u64 len_blocks;
3464
3465         len_blocks = ocfs2_clusters_to_blocks(sb,
3466                                 le16_to_cpu(split_rec->e_leaf_clusters));
3467
3468         if (split == SPLIT_LEFT) {
3469                 /*
3470                  * Region is on the left edge of the existing
3471                  * record.
3472                  */
3473                 le32_add_cpu(&rec->e_cpos,
3474                              le16_to_cpu(split_rec->e_leaf_clusters));
3475                 le64_add_cpu(&rec->e_blkno, len_blocks);
3476                 le16_add_cpu(&rec->e_leaf_clusters,
3477                              -le16_to_cpu(split_rec->e_leaf_clusters));
3478         } else {
3479                 /*
3480                  * Region is on the right edge of the existing
3481                  * record.
3482                  */
3483                 le16_add_cpu(&rec->e_leaf_clusters,
3484                              -le16_to_cpu(split_rec->e_leaf_clusters));
3485         }
3486 }
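
/*
 * Minimal userspace-style sketch of ocfs2_subtract_from_rec() (the type
 * and helper are hypothetical; plain host-endian integers stand in for
 * the __le fields, and blocks_per_cluster replaces
 * ocfs2_clusters_to_blocks()):
 *
 *	struct demo_ext {
 *		unsigned int cpos;
 *		unsigned long long blkno;
 *		unsigned int clusters;
 *	};
 *
 *	static void demo_subtract(struct demo_ext *rec, int split_left,
 *				  unsigned int split_clusters,
 *				  unsigned int blocks_per_cluster)
 *	{
 *		if (split_left) {
 *			rec->cpos += split_clusters;
 *			rec->blkno += (unsigned long long)split_clusters *
 *				      blocks_per_cluster;
 *		}
 *		rec->clusters -= split_clusters;
 *	}
 *
 * A SPLIT_LEFT of 3 clusters against (cpos=100, clusters=8) yields
 * (cpos=103, clusters=5) with blkno advanced by 3 clusters' worth of
 * blocks; a SPLIT_RIGHT of 3 clusters yields (cpos=100, clusters=5) and
 * leaves cpos and blkno untouched.
 */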
3487
3488 /*
3489  * Do the final bits of extent record insertion at the target leaf
3490  * list. If this leaf is part of an allocation tree, it is assumed
3491  * that the tree above has been prepared.
3492  */
3493 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3494                                  struct ocfs2_extent_list *el,
3495                                  struct ocfs2_insert_type *insert,
3496                                  struct inode *inode)
3497 {
3498         int i = insert->ins_contig_index;
3499         unsigned int range;
3500         struct ocfs2_extent_rec *rec;
3501
3502         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3503
3504         if (insert->ins_split != SPLIT_NONE) {
3505                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3506                 BUG_ON(i == -1);
3507                 rec = &el->l_recs[i];
3508                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3509                                         insert_rec);
3510                 goto rotate;
3511         }
3512
3513         /*
3514          * Contiguous insert - either left or right.
3515          */
3516         if (insert->ins_contig != CONTIG_NONE) {
3517                 rec = &el->l_recs[i];
3518                 if (insert->ins_contig == CONTIG_LEFT) {
3519                         rec->e_blkno = insert_rec->e_blkno;
3520                         rec->e_cpos = insert_rec->e_cpos;
3521                 }
3522                 le16_add_cpu(&rec->e_leaf_clusters,
3523                              le16_to_cpu(insert_rec->e_leaf_clusters));
3524                 return;
3525         }
3526
3527         /*
3528          * Handle insert into an empty leaf.
3529          */
3530         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3531             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3532              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3533                 el->l_recs[0] = *insert_rec;
3534                 el->l_next_free_rec = cpu_to_le16(1);
3535                 return;
3536         }
3537
3538         /*
3539          * Appending insert.
3540          */
3541         if (insert->ins_appending == APPEND_TAIL) {
3542                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3543                 rec = &el->l_recs[i];
3544                 range = le32_to_cpu(rec->e_cpos)
3545                         + le16_to_cpu(rec->e_leaf_clusters);
3546                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3547
3548                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3549                                 le16_to_cpu(el->l_count),
3550                                 "inode %lu, depth %u, count %u, next free %u, "
3551                                 "rec.cpos %u, rec.clusters %u, "
3552                                 "insert.cpos %u, insert.clusters %u\n",
3553                                 inode->i_ino,
3554                                 le16_to_cpu(el->l_tree_depth),
3555                                 le16_to_cpu(el->l_count),
3556                                 le16_to_cpu(el->l_next_free_rec),
3557                                 le32_to_cpu(el->l_recs[i].e_cpos),
3558                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3559                                 le32_to_cpu(insert_rec->e_cpos),
3560                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3561                 i++;
3562                 el->l_recs[i] = *insert_rec;
3563                 le16_add_cpu(&el->l_next_free_rec, 1);
3564                 return;
3565         }
3566
3567 rotate:
3568         /*
3569          * Ok, we have to rotate.
3570          *
3571          * At this point, it is safe to assume that inserting into an
3572          * empty leaf and appending to a leaf have both been handled
3573          * above.
3574          *
3575          * This leaf needs to have space, either by the empty 1st
3576          * extent record, or by virtue of an l_next_free_rec < l_count.
3577          */
3578         ocfs2_rotate_leaf(el, insert_rec);
3579 }
3580
3581 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3582                                            handle_t *handle,
3583                                            struct ocfs2_path *path,
3584                                            struct ocfs2_extent_rec *insert_rec)
3585 {
3586         int ret, i, next_free;
3587         struct buffer_head *bh;
3588         struct ocfs2_extent_list *el;
3589         struct ocfs2_extent_rec *rec;
3590
3591         /*
3592          * Update everything except the leaf block.
3593          */
3594         for (i = 0; i < path->p_tree_depth; i++) {
3595                 bh = path->p_node[i].bh;
3596                 el = path->p_node[i].el;
3597
3598                 next_free = le16_to_cpu(el->l_next_free_rec);
3599                 if (next_free == 0) {
3600                         ocfs2_error(inode->i_sb,
3601                                     "Dinode %llu has a bad extent list",
3602                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3603                         ret = -EIO;
3604                         return;
3605                 }
3606
3607                 rec = &el->l_recs[next_free - 1];
3608
3609                 rec->e_int_clusters = insert_rec->e_cpos;
3610                 le32_add_cpu(&rec->e_int_clusters,
3611                              le16_to_cpu(insert_rec->e_leaf_clusters));
3612                 le32_add_cpu(&rec->e_int_clusters,
3613                              -le32_to_cpu(rec->e_cpos));
3614
3615                 ret = ocfs2_journal_dirty(handle, bh);
3616                 if (ret)
3617                         mlog_errno(ret);
3618
3619         }
3620 }
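
/*
 * Worked example for ocfs2_adjust_rightmost_records() above
 * (illustrative numbers, not from the source): appending insert_rec
 * (e_cpos=200, e_leaf_clusters=50) down a rightmost path whose root
 * record starts at e_cpos=0 and whose next-level record starts at
 * e_cpos=128 sets their e_int_clusters to 250 and 122 respectively -
 * in both cases "end of the new insert minus start of the interior
 * record", i.e. the clusters now covered beneath that record.
 */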
3621
3622 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3623                                     struct ocfs2_extent_rec *insert_rec,
3624                                     struct ocfs2_path *right_path,
3625                                     struct ocfs2_path **ret_left_path)
3626 {
3627         int ret, next_free;
3628         struct ocfs2_extent_list *el;
3629         struct ocfs2_path *left_path = NULL;
3630
3631         *ret_left_path = NULL;
3632
3633         /*
3634          * This shouldn't happen for non-trees. The extent rec cluster
3635          * count manipulation below only works for interior nodes.
3636          */
3637         BUG_ON(right_path->p_tree_depth == 0);
3638
3639         /*
3640          * If our appending insert is at the leftmost edge of a leaf,
3641          * then we might need to update the rightmost records of the
3642          * neighboring path.
3643          */
3644         el = path_leaf_el(right_path);
3645         next_free = le16_to_cpu(el->l_next_free_rec);
3646         if (next_free == 0 ||
3647             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3648                 u32 left_cpos;
3649
3650                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3651                                                     &left_cpos);
3652                 if (ret) {
3653                         mlog_errno(ret);
3654                         goto out;
3655                 }
3656
3657                 mlog(0, "Append may need a left path update. cpos: %u, "
3658                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3659                      left_cpos);
3660
3661                 /*
3662                  * No need to worry if the append is already in the
3663                  * leftmost leaf.
3664                  */
3665                 if (left_cpos) {
3666                         left_path = ocfs2_new_path(path_root_bh(right_path),
3667                                                    path_root_el(right_path));
3668                         if (!left_path) {
3669                                 ret = -ENOMEM;
3670                                 mlog_errno(ret);
3671                                 goto out;
3672                         }
3673
3674                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3675                         if (ret) {
3676                                 mlog_errno(ret);
3677                                 goto out;
3678                         }
3679
3680                         /*
3681                          * ocfs2_insert_path() will pass the left_path to the
3682                          * journal for us.
3683                          */
3684                 }
3685         }
3686
3687         ret = ocfs2_journal_access_path(inode, handle, right_path);
3688         if (ret) {
3689                 mlog_errno(ret);
3690                 goto out;
3691         }
3692
3693         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3694
3695         *ret_left_path = left_path;
3696         ret = 0;
3697 out:
3698         if (ret != 0)
3699                 ocfs2_free_path(left_path);
3700
3701         return ret;
3702 }
3703
3704 static void ocfs2_split_record(struct inode *inode,
3705                                struct ocfs2_path *left_path,
3706                                struct ocfs2_path *right_path,
3707                                struct ocfs2_extent_rec *split_rec,
3708                                enum ocfs2_split_type split)
3709 {
3710         int index;
3711         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3712         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3713         struct ocfs2_extent_rec *rec, *tmprec;
3714
3715         right_el = path_leaf_el(right_path);
3716         if (left_path)
3717                 left_el = path_leaf_el(left_path);
3718
3719         el = right_el;
3720         insert_el = right_el;
3721         index = ocfs2_search_extent_list(el, cpos);
3722         if (index != -1) {
3723                 if (index == 0 && left_path) {
3724                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3725
3726                         /*
3727                          * This typically means that the record
3728                          * started in the left path but moved to the
3729                          * right as a result of rotation. We either
3730                          * move the existing record to the left, or we
3731                          * do the later insert there.
3732                          *
3733                          * In this case, the left path should always
3734                          * exist as the rotate code will have passed
3735                          * it back for a post-insert update.
3736                          */
3737
3738                         if (split == SPLIT_LEFT) {
3739                                 /*
3740                                  * It's a left split. Since we know
3741                                  * that the rotate code gave us an
3742                                  * empty extent in the left path, we
3743                                  * can just do the insert there.
3744                                  */
3745                                 insert_el = left_el;
3746                         } else {
3747                                 /*
3748                                  * Right split - we have to move the
3749                                  * existing record over to the left
3750                                  * leaf. The insert will be into the
3751                                  * newly created empty extent in the
3752                                  * right leaf.
3753                                  */
3754                                 tmprec = &right_el->l_recs[index];
3755                                 ocfs2_rotate_leaf(left_el, tmprec);
3756                                 el = left_el;
3757
3758                                 memset(tmprec, 0, sizeof(*tmprec));
3759                                 index = ocfs2_search_extent_list(left_el, cpos);
3760                                 BUG_ON(index == -1);
3761                         }
3762                 }
3763         } else {
3764                 BUG_ON(!left_path);
3765                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3766                 /*
3767                  * Left path is easy - we can just allow the insert to
3768                  * happen.
3769                  */
3770                 el = left_el;
3771                 insert_el = left_el;
3772                 index = ocfs2_search_extent_list(el, cpos);
3773                 BUG_ON(index == -1);
3774         }
3775
3776         rec = &el->l_recs[index];
3777         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3778         ocfs2_rotate_leaf(insert_el, split_rec);
3779 }
3780
3781 /*
3782  * This function only does inserts on an allocation b-tree. For tree
3783  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3784  *
3785  * right_path is the path we want to do the actual insert
3786  * in. left_path should only be passed in if we need to update that
3787  * portion of the tree after an edge insert.
3788  */
3789 static int ocfs2_insert_path(struct inode *inode,
3790                              handle_t *handle,
3791                              struct ocfs2_path *left_path,
3792                              struct ocfs2_path *right_path,
3793                              struct ocfs2_extent_rec *insert_rec,
3794                              struct ocfs2_insert_type *insert)
3795 {
3796         int ret, subtree_index;
3797         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3798
3799         if (left_path) {
3800                 int credits = handle->h_buffer_credits;
3801
3802                 /*
3803                  * There's a chance that left_path got passed back to
3804                  * us without being accounted for in the
3805                  * journal. Extend our transaction here to be sure we
3806                  * can change those blocks.
3807                  */
3808                 credits += left_path->p_tree_depth;
3809
3810                 ret = ocfs2_extend_trans(handle, credits);
3811                 if (ret < 0) {
3812                         mlog_errno(ret);
3813                         goto out;
3814                 }
3815
3816                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3817                 if (ret < 0) {
3818                         mlog_errno(ret);
3819                         goto out;
3820                 }
3821         }
3822
3823         /*
3824          * Pass both paths to the journal. The majority of inserts
3825          * will be touching all components anyway.
3826          */
3827         ret = ocfs2_journal_access_path(inode, handle, right_path);
3828         if (ret < 0) {
3829                 mlog_errno(ret);
3830                 goto out;
3831         }
3832
3833         if (insert->ins_split != SPLIT_NONE) {
3834                 /*
3835                  * We could call ocfs2_insert_at_leaf() for some types
3836                  * of splits, but it's easier to just let one separate
3837                  * function sort it all out.
3838                  */
3839                 ocfs2_split_record(inode, left_path, right_path,
3840                                    insert_rec, insert->ins_split);
3841
3842                 /*
3843                  * Split might have modified either leaf and we don't
3844                  * have a guarantee that the later edge insert will
3845                  * dirty this for us.
3846                  */
3847                 if (left_path) {
3848                         ret = ocfs2_journal_dirty(handle,
3849                                                   path_leaf_bh(left_path));
3850                         if (ret)
3851                                 mlog_errno(ret);
                     }
3852         } else
3853                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3854                                      insert, inode);
3855
3856         ret = ocfs2_journal_dirty(handle, leaf_bh);
3857         if (ret)
3858                 mlog_errno(ret);
3859
3860         if (left_path) {
3861                 /*
3862                  * The rotate code has indicated that we need to fix
3863                  * up portions of the tree after the insert.
3864                  *
3865                  * XXX: Should we extend the transaction here?
3866                  */
3867                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3868                                                         right_path);
3869                 ocfs2_complete_edge_insert(inode, handle, left_path,
3870                                            right_path, subtree_index);
3871         }
3872
3873         ret = 0;
3874 out:
3875         return ret;
3876 }
3877
3878 static int ocfs2_do_insert_extent(struct inode *inode,
3879                                   handle_t *handle,
3880                                   struct ocfs2_extent_tree *et,
3881                                   struct ocfs2_extent_rec *insert_rec,
3882                                   struct ocfs2_insert_type *type)
3883 {
3884         int ret, rotate = 0;
3885         u32 cpos;
3886         struct ocfs2_path *right_path = NULL;
3887         struct ocfs2_path *left_path = NULL;
3888         struct ocfs2_extent_list *el;
3889
3890         el = et->root_el;
3891
3892         ret = ocfs2_journal_access(handle, inode, et->root_bh,
3893                                    OCFS2_JOURNAL_ACCESS_WRITE);
3894         if (ret) {
3895                 mlog_errno(ret);
3896                 goto out;
3897         }
3898
3899         if (le16_to_cpu(el->l_tree_depth) == 0) {
3900                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3901                 goto out_update_clusters;
3902         }
3903
3904         right_path = ocfs2_new_path(et->root_bh, et->root_el);
3905         if (!right_path) {
3906                 ret = -ENOMEM;
3907                 mlog_errno(ret);
3908                 goto out;
3909         }
3910
3911         /*
3912          * Determine the path to start with. Rotations need the
3913          * rightmost path, everything else can go directly to the
3914          * target leaf.
3915          */
3916         cpos = le32_to_cpu(insert_rec->e_cpos);
3917         if (type->ins_appending == APPEND_NONE &&
3918             type->ins_contig == CONTIG_NONE) {
3919                 rotate = 1;
3920                 cpos = UINT_MAX;
3921         }
3922
3923         ret = ocfs2_find_path(inode, right_path, cpos);
3924         if (ret) {
3925                 mlog_errno(ret);
3926                 goto out;
3927         }
3928
3929         /*
3930          * Rotations and appends need special treatment - they modify
3931          * parts of the tree above them.
3932          *
3933          * Both might pass back a path immediately to the left of the
3934          * one being inserted to. This will cause
3935          * ocfs2_insert_path() to modify the rightmost records of
3936          * left_path to account for an edge insert.
3937          *
3938          * XXX: When modifying this code, keep in mind that an insert
3939          * can wind up skipping both of these two special cases...
3940          */
3941         if (rotate) {
3942                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
3943                                               le32_to_cpu(insert_rec->e_cpos),
3944                                               right_path, &left_path);
3945                 if (ret) {
3946                         mlog_errno(ret);
3947                         goto out;
3948                 }
3949
3950                 /*
3951                  * ocfs2_rotate_tree_right() might have extended the
3952                  * transaction without re-journaling our tree root.
3953                  */
3954                 ret = ocfs2_journal_access(handle, inode, et->root_bh,
3955                                            OCFS2_JOURNAL_ACCESS_WRITE);
3956                 if (ret) {
3957                         mlog_errno(ret);
3958                         goto out;
3959                 }
3960         } else if (type->ins_appending == APPEND_TAIL
3961                    && type->ins_contig != CONTIG_LEFT) {
3962                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
3963                                                right_path, &left_path);
3964                 if (ret) {
3965                         mlog_errno(ret);
3966                         goto out;
3967                 }
3968         }
3969
3970         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
3971                                 insert_rec, type);
3972         if (ret) {
3973                 mlog_errno(ret);
3974                 goto out;
3975         }
3976
3977 out_update_clusters:
3978         if (type->ins_split == SPLIT_NONE)
3979                 ocfs2_update_clusters(inode, et,
3980                                       le16_to_cpu(insert_rec->e_leaf_clusters));
3981
3982         ret = ocfs2_journal_dirty(handle, et->root_bh);
3983         if (ret)
3984                 mlog_errno(ret);
3985
3986 out:
3987         ocfs2_free_path(left_path);
3988         ocfs2_free_path(right_path);
3989
3990         return ret;
3991 }
3992
3993 static enum ocfs2_contig_type
3994 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
3995                                struct ocfs2_extent_list *el, int index,
3996                                struct ocfs2_extent_rec *split_rec)
3997 {
3998         int status;
3999         enum ocfs2_contig_type ret = CONTIG_NONE;
4000         u32 left_cpos, right_cpos;
4001         struct ocfs2_extent_rec *rec = NULL;
4002         struct ocfs2_extent_list *new_el;
4003         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4004         struct buffer_head *bh;
4005         struct ocfs2_extent_block *eb;
4006
4007         if (index > 0) {
4008                 rec = &el->l_recs[index - 1];
4009         } else if (path->p_tree_depth > 0) {
4010                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4011                                                        path, &left_cpos);
4012                 if (status)
4013                         goto out;
4014
4015                 if (left_cpos != 0) {
4016                         left_path = ocfs2_new_path(path_root_bh(path),
4017                                                    path_root_el(path));
4018                         if (!left_path)
4019                                 goto out;
4020
4021                         status = ocfs2_find_path(inode, left_path, left_cpos);
4022                         if (status)
4023                                 goto out;
4024
4025                         new_el = path_leaf_el(left_path);
4026
4027                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4028                             le16_to_cpu(new_el->l_count)) {
4029                                 bh = path_leaf_bh(left_path);
4030                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4031                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4032                                                                  eb);
4033                                 goto out;
4034                         }
4035                         rec = &new_el->l_recs[
4036                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4037                 }
4038         }
4039
4040         /*
4041          * We're careful to check for an empty extent record here -
4042          * the merge code will know what to do if it sees one.
4043          */
4044         if (rec) {
4045                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4046                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4047                                 ret = CONTIG_RIGHT;
4048                 } else {
4049                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4050                 }
4051         }
4052
4053         rec = NULL;
4054         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4055                 rec = &el->l_recs[index + 1];
4056         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4057                  path->p_tree_depth > 0) {
4058                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4059                                                         path, &right_cpos);
4060                 if (status)
4061                         goto out;
4062
4063                 if (right_cpos == 0)
4064                         goto out;
4065
4066                 right_path = ocfs2_new_path(path_root_bh(path),
4067                                             path_root_el(path));
4068                 if (!right_path)
4069                         goto out;
4070
4071                 status = ocfs2_find_path(inode, right_path, right_cpos);
4072                 if (status)
4073                         goto out;
4074
4075                 new_el = path_leaf_el(right_path);
4076                 rec = &new_el->l_recs[0];
4077                 if (ocfs2_is_empty_extent(rec)) {
4078                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4079                                 bh = path_leaf_bh(right_path);
4080                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4081                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4082                                                                  eb);
4083                                 goto out;
4084                         }
4085                         rec = &new_el->l_recs[1];
4086                 }
4087         }
4088
4089         if (rec) {
4090                 enum ocfs2_contig_type contig_type;
4091
4092                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4093
4094                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4095                         ret = CONTIG_LEFTRIGHT;
4096                 else if (ret == CONTIG_NONE)
4097                         ret = contig_type;
4098         }
4099
4100 out:
4101         if (left_path)
4102                 ocfs2_free_path(left_path);
4103         if (right_path)
4104                 ocfs2_free_path(right_path);
4105
4106         return ret;
4107 }
4108
4109 static void ocfs2_figure_contig_type(struct inode *inode,
4110                                      struct ocfs2_insert_type *insert,
4111                                      struct ocfs2_extent_list *el,
4112                                      struct ocfs2_extent_rec *insert_rec)
4113 {
4114         int i;
4115         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4116
4117         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4118
4119         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4120                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4121                                                   insert_rec);
4122                 if (contig_type != CONTIG_NONE) {
4123                         insert->ins_contig_index = i;
4124                         break;
4125                 }
4126         }
4127         insert->ins_contig = contig_type;
4128 }
4129
4130 /*
4131  * This should only be called against the rightmost leaf extent list.
4132  *
4133  * ocfs2_figure_appending_type() will figure out whether we'll have to
4134  * insert at the tail of the rightmost leaf.
4135  *
4136  * This should also work against the root extent list for trees with 0
4137  * depth. If we consider the root extent list to be the rightmost leaf node
4138  * then the logic here makes sense.
4139  */
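/*
 * Worked example (illustrative numbers, not from the source): if the
 * rightmost leaf's last record is (e_cpos=100, e_leaf_clusters=20),
 * then an insert_rec starting at cpos 120 or beyond is flagged
 * APPEND_TAIL, while one starting at, say, cpos 40 leaves ins_appending
 * as APPEND_NONE. A leaf with no records, or whose only record is the
 * empty record, is always treated as a tail append.
 */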
4140 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4141                                         struct ocfs2_extent_list *el,
4142                                         struct ocfs2_extent_rec *insert_rec)
4143 {
4144         int i;
4145         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4146         struct ocfs2_extent_rec *rec;
4147
4148         insert->ins_appending = APPEND_NONE;
4149
4150         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4151
4152         if (!el->l_next_free_rec)
4153                 goto set_tail_append;
4154
4155         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4156                 /* Were all records empty? */
4157                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4158                         goto set_tail_append;
4159         }
4160
4161         i = le16_to_cpu(el->l_next_free_rec) - 1;
4162         rec = &el->l_recs[i];
4163
4164         if (cpos >=
4165             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4166                 goto set_tail_append;
4167
4168         return;
4169
4170 set_tail_append:
4171         insert->ins_appending = APPEND_TAIL;
4172 }
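
/*
 * Editor's note: an illustrative sketch of the tail-append test above,
 * not part of the original file, and ignoring the empty-record special
 * case for brevity.  Assume the last valid record of a leaf covers
 * (e_cpos 100, 8 clusters); its end is then cluster 108, so:
 *
 *	last = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
 *	end  = le32_to_cpu(last->e_cpos) +
 *	       le16_to_cpu(last->e_leaf_clusters);	(100 + 8 = 108)
 *	if (le32_to_cpu(insert_rec->e_cpos) >= end)
 *		insert->ins_appending = APPEND_TAIL;	(e.g. cpos 108, 110)
 *
 * Anything below cluster 108 stays APPEND_NONE and is handled by the
 * regular insert/rotate paths.
 */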
4173
4174 /*
4175  * Helper function called at the beginning of an insert.
4176  *
4177  * This computes a few things that are commonly used in the process of
4178  * inserting into the btree:
4179  *   - Whether the new extent is contiguous with an existing one.
4180  *   - The current tree depth.
4181  *   - Whether the insert is an appending one.
4182  *   - The total # of free records in the tree.
4183  *
4184  * All of the information is stored on the ocfs2_insert_type
4185  * structure.
4186  */
4187 static int ocfs2_figure_insert_type(struct inode *inode,
4188                                     struct ocfs2_extent_tree *et,
4189                                     struct buffer_head **last_eb_bh,
4190                                     struct ocfs2_extent_rec *insert_rec,
4191                                     int *free_records,
4192                                     struct ocfs2_insert_type *insert)
4193 {
4194         int ret;
4195         struct ocfs2_extent_block *eb;
4196         struct ocfs2_extent_list *el;
4197         struct ocfs2_path *path = NULL;
4198         struct buffer_head *bh = NULL;
4199
4200         insert->ins_split = SPLIT_NONE;
4201
4202         el = et->root_el;
4203         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4204
4205         if (el->l_tree_depth) {
4206                 /*
4207                  * If we have tree depth, we read in the
4208                  * rightmost extent block ahead of time as
4209                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4210                  * may want it later.
4211                  */
4212                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4213                                        ocfs2_get_last_eb_blk(et), &bh,
4214                                        OCFS2_BH_CACHED, inode);
4215                 if (ret) {
4216                         mlog_exit(ret);
4217                         goto out;
4218                 }
4219                 eb = (struct ocfs2_extent_block *) bh->b_data;
4220                 el = &eb->h_list;
4221         }
4222
4223         /*
4224          * Unless we have a contiguous insert, we'll need to know if
4225          * there is room left in our allocation tree for another
4226          * extent record.
4227          *
4228          * XXX: This test is simplistic, we can search for empty
4229          * extent records too.
4230          */
4231         *free_records = le16_to_cpu(el->l_count) -
4232                 le16_to_cpu(el->l_next_free_rec);
4233
4234         if (!insert->ins_tree_depth) {
4235                 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4236                 ocfs2_figure_appending_type(insert, el, insert_rec);
4237                 return 0;
4238         }
4239
4240         path = ocfs2_new_path(et->root_bh, et->root_el);
4241         if (!path) {
4242                 ret = -ENOMEM;
4243                 mlog_errno(ret);
4244                 goto out;
4245         }
4246
4247         /*
4248          * In the case that we're inserting past what the tree
4249          * currently accounts for, ocfs2_find_path() will return for
4250          * us the rightmost tree path. This is accounted for below in
4251          * the appending code.
4252          */
4253         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4254         if (ret) {
4255                 mlog_errno(ret);
4256                 goto out;
4257         }
4258
4259         el = path_leaf_el(path);
4260
4261         /*
4262          * Now that we have the path, there are two things we want to determine:
4263          * 1) Contiguousness (also set contig_index if this is so)
4264          *
4265          * 2) Are we doing an append? We can trivially break this up
4266          *     into two types of appends: simple record append, or a
4267          *     rotate inside the tail leaf.
4268          */
4269         ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4270
4271         /*
4272          * The insert code isn't quite ready to deal with all cases of
4273          * left contiguousness. Specifically, if it's an insert into
4274          * the 1st record in a leaf, it will require the adjustment of
4275          * cluster count on the last record of the path directly to its
4276          * left. For now, just catch that case and fool the layers
4277          * above us. This works just fine for tree_depth == 0, which
4278          * is why we allow that above.
4279          */
4280         if (insert->ins_contig == CONTIG_LEFT &&
4281             insert->ins_contig_index == 0)
4282                 insert->ins_contig = CONTIG_NONE;
4283
4284         /*
4285          * Ok, so we can simply compare against last_eb to figure out
4286          * whether the path doesn't exist. This will only happen in
4287          * the case that we're doing a tail append, so maybe we can
4288          * take advantage of that information somehow.
4289          */
4290         if (ocfs2_get_last_eb_blk(et) ==
4291             path_leaf_bh(path)->b_blocknr) {
4292                 /*
4293                  * Ok, ocfs2_find_path() returned us the rightmost
4294                  * tree path. This might be an appending insert. There are
4295                  * two cases:
4296                  *    1) We're doing a true append at the tail:
4297                  *      -This might even be off the end of the leaf
4298                  *    2) We're "appending" by rotating in the tail
4299                  */
4300                 ocfs2_figure_appending_type(insert, el, insert_rec);
4301         }
4302
4303 out:
4304         ocfs2_free_path(path);
4305
4306         if (ret == 0)
4307                 *last_eb_bh = bh;
4308         else
4309                 brelse(bh);
4310         return ret;
4311 }
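
/*
 * Editor's worked example for the free-records arithmetic above, using
 * invented counts rather than real on-disk values: with a rightmost
 * extent list holding l_count == 16 record slots, of which
 * l_next_free_rec == 14 are in use,
 *
 *	free_records = le16_to_cpu(el->l_count) -
 *		       le16_to_cpu(el->l_next_free_rec);	(16 - 14 = 2)
 *
 * so two more non-contiguous records fit before ocfs2_insert_extent()
 * below has to call ocfs2_grow_tree().
 */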
4312
4313 /*
4314  * Insert an extent into an inode btree.
4315  *
4316  * The caller needs to update fe->i_clusters
4317  */
4318 static int ocfs2_insert_extent(struct ocfs2_super *osb,
4319                                handle_t *handle,
4320                                struct inode *inode,
4321                                struct buffer_head *root_bh,
4322                                u32 cpos,
4323                                u64 start_blk,
4324                                u32 new_clusters,
4325                                u8 flags,
4326                                struct ocfs2_alloc_context *meta_ac,
4327                                struct ocfs2_extent_tree *et)
4328 {
4329         int status;
4330         int uninitialized_var(free_records);
4331         struct buffer_head *last_eb_bh = NULL;
4332         struct ocfs2_insert_type insert = {0, };
4333         struct ocfs2_extent_rec rec;
4334
4335         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4336
4337         mlog(0, "add %u clusters at position %u to inode %llu\n",
4338              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4339
4340         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4341                         (OCFS2_I(inode)->ip_clusters != cpos),
4342                         "Device %s, asking for sparse allocation: inode %llu, "
4343                         "cpos %u, clusters %u\n",
4344                         osb->dev_str,
4345                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4346                         OCFS2_I(inode)->ip_clusters);
4347
4348         memset(&rec, 0, sizeof(rec));
4349         rec.e_cpos = cpu_to_le32(cpos);
4350         rec.e_blkno = cpu_to_le64(start_blk);
4351         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4352         rec.e_flags = flags;
4353
4354         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4355                                           &free_records, &insert);
4356         if (status < 0) {
4357                 mlog_errno(status);
4358                 goto bail;
4359         }
4360
4361         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4362              "Insert.contig_index: %d, Insert.free_records: %d, "
4363              "Insert.tree_depth: %d\n",
4364              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4365              free_records, insert.ins_tree_depth);
4366
4367         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4368                 status = ocfs2_grow_tree(inode, handle, et,
4369                                          &insert.ins_tree_depth, &last_eb_bh,
4370                                          meta_ac);
4371                 if (status) {
4372                         mlog_errno(status);
4373                         goto bail;
4374                 }
4375         }
4376
4377         /* Finally, we can add clusters. This might rotate the tree for us. */
4378         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4379         if (status < 0)
4380                 mlog_errno(status);
4381         else if (et->type == OCFS2_DINODE_EXTENT)
4382                 ocfs2_extent_map_insert_rec(inode, &rec);
4383
4384 bail:
4385         if (last_eb_bh)
4386                 brelse(last_eb_bh);
4387
4388         mlog_exit(status);
4389         return status;
4390 }
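
/*
 * Editor's sketch of the caller-side bookkeeping mentioned above (an
 * assumption about typical usage, not code lifted from this file): after
 * a successful dinode insert the caller bumps the on-disk and in-core
 * cluster counts itself, with 'di' pointing at the journalled dinode in
 * root_bh and ip_lock assumed to guard the in-core count.
 *
 *	le32_add_cpu(&di->i_clusters, new_clusters);
 *	spin_lock(&OCFS2_I(inode)->ip_lock);
 *	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
 *	spin_unlock(&OCFS2_I(inode)->ip_lock);
 */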
4391
4392 int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
4393                                handle_t *handle,
4394                                struct inode *inode,
4395                                struct buffer_head *root_bh,
4396                                u32 cpos,
4397                                u64 start_blk,
4398                                u32 new_clusters,
4399                                u8 flags,
4400                                struct ocfs2_alloc_context *meta_ac)
4401 {
4402         int status;
4403         struct ocfs2_extent_tree *et = NULL;
4404
4405         et = ocfs2_new_extent_tree(root_bh, OCFS2_DINODE_EXTENT, NULL);
4406         if (!et) {
4407                 status = -ENOMEM;
4408                 mlog_errno(status);
4409                 goto bail;
4410         }
4411
4412         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4413                                      cpos, start_blk, new_clusters,
4414                                      flags, meta_ac, et);
4415
4416         if (et)
4417                 ocfs2_free_extent_tree(et);
4418 bail:
4419         return status;
4420 }
4421
4422 int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
4423                                     handle_t *handle,
4424                                     struct inode *inode,
4425                                     struct buffer_head *root_bh,
4426                                     u32 cpos,
4427                                     u64 start_blk,
4428                                     u32 new_clusters,
4429                                     u8 flags,
4430                                     struct ocfs2_alloc_context *meta_ac,
4431                                     void *private)
4432 {
4433         int status;
4434         struct ocfs2_extent_tree *et = NULL;
4435
4436         et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_VALUE_EXTENT, private);
4437         if (!et) {
4438                 status = -ENOMEM;
4439                 mlog_errno(status);
4440                 goto bail;
4441         }
4442
4443         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4444                                      cpos, start_blk, new_clusters,
4445                                      flags, meta_ac, et);
4446
4447         if (et)
4448                 ocfs2_free_extent_tree(et);
4449 bail:
4450         return status;
4451 }
4452
4453 int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
4454                                    handle_t *handle,
4455                                    struct inode *inode,
4456                                    struct buffer_head *root_bh,
4457                                    u32 cpos,
4458                                    u64 start_blk,
4459                                    u32 new_clusters,
4460                                    u8 flags,
4461                                    struct ocfs2_alloc_context *meta_ac)
4462 {
4463         int status;
4464         struct ocfs2_extent_tree *et = NULL;
4465
4466         et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_TREE_EXTENT, NULL);
4467         if (!et) {
4468                 status = -ENOMEM;
4469                 mlog_errno(status);
4470                 goto bail;
4471         }
4472
4473         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4474                                      cpos, start_blk, new_clusters,
4475                                      flags, meta_ac, et);
4476
4477         if (et)
4478                 ocfs2_free_extent_tree(et);
4479 bail:
4480         return status;
4481 }
4482
4483 /*
4484  * Allocate and add clusters into the extent b-tree.
4485  * The new clusters (clusters_to_add) will be inserted at logical_offset.
4486  * The extent b-tree's root is root_el and it should be in root_bh, and
4487  * it is not limited to file data storage. Any extent tree can use this
4488  * function if it implements the proper ocfs2_extent_tree.
4489  */
4490 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4491                                 struct inode *inode,
4492                                 u32 *logical_offset,
4493                                 u32 clusters_to_add,
4494                                 int mark_unwritten,
4495                                 struct buffer_head *root_bh,
4496                                 struct ocfs2_extent_list *root_el,
4497                                 handle_t *handle,
4498                                 struct ocfs2_alloc_context *data_ac,
4499                                 struct ocfs2_alloc_context *meta_ac,
4500                                 enum ocfs2_alloc_restarted *reason_ret,
4501                                 enum ocfs2_extent_tree_type type,
4502                                 void *private)
4503 {
4504         int status = 0;
4505         int free_extents;
4506         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4507         u32 bit_off, num_bits;
4508         u64 block;
4509         u8 flags = 0;
4510
4511         BUG_ON(!clusters_to_add);
4512
4513         if (mark_unwritten)
4514                 flags = OCFS2_EXT_UNWRITTEN;
4515
4516         free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
4517                                               private);
4518         if (free_extents < 0) {
4519                 status = free_extents;
4520                 mlog_errno(status);
4521                 goto leave;
4522         }
4523
4524         /* There are two cases which could cause us to return -EAGAIN
4525          * in the we-need-more-metadata case:
4526          * 1) we haven't reserved *any* metadata
4527          * 2) we are so fragmented, we've needed to add metadata too
4528          *    many times. */
4529         if (!free_extents && !meta_ac) {
4530                 mlog(0, "we haven't reserved any metadata!\n");
4531                 status = -EAGAIN;
4532                 reason = RESTART_META;
4533                 goto leave;
4534         } else if ((!free_extents)
4535                    && (ocfs2_alloc_context_bits_left(meta_ac)
4536                        < ocfs2_extend_meta_needed(root_el))) {
4537                 mlog(0, "filesystem is really fragmented...\n");
4538                 status = -EAGAIN;
4539                 reason = RESTART_META;
4540                 goto leave;
4541         }
4542
4543         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4544                                         clusters_to_add, &bit_off, &num_bits);
4545         if (status < 0) {
4546                 if (status != -ENOSPC)
4547                         mlog_errno(status);
4548                 goto leave;
4549         }
4550
4551         BUG_ON(num_bits > clusters_to_add);
4552
4553         /* reserve our write early -- insert_extent may update the inode */
4554         status = ocfs2_journal_access(handle, inode, root_bh,
4555                                       OCFS2_JOURNAL_ACCESS_WRITE);
4556         if (status < 0) {
4557                 mlog_errno(status);
4558                 goto leave;
4559         }
4560
4561         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4562         mlog(0, "Allocating %u clusters starting at cluster %u for inode %llu\n",
4563              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4564         if (type == OCFS2_DINODE_EXTENT)
4565                 status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh,
4566                                                     *logical_offset, block,
4567                                                     num_bits, flags, meta_ac);
4568         else if (type == OCFS2_XATTR_TREE_EXTENT)
4569                 status = ocfs2_xattr_tree_insert_extent(osb, handle,
4570                                                         inode, root_bh,
4571                                                         *logical_offset,
4572                                                         block, num_bits, flags,
4573                                                         meta_ac);
4574         else
4575                 status = ocfs2_xattr_value_insert_extent(osb, handle,
4576                                                          inode, root_bh,
4577                                                          *logical_offset,
4578                                                          block, num_bits, flags,
4579                                                          meta_ac, private);
4580         if (status < 0) {
4581                 mlog_errno(status);
4582                 goto leave;
4583         }
4584
4585         status = ocfs2_journal_dirty(handle, root_bh);
4586         if (status < 0) {
4587                 mlog_errno(status);
4588                 goto leave;
4589         }
4590
4591         clusters_to_add -= num_bits;
4592         *logical_offset += num_bits;
4593
4594         if (clusters_to_add) {
4595                 mlog(0, "need to alloc once more, wanted = %u\n",
4596                      clusters_to_add);
4597                 status = -EAGAIN;
4598                 reason = RESTART_TRANS;
4599         }
4600
4601 leave:
4602         mlog_exit(status);
4603         if (reason_ret)
4604                 *reason_ret = reason;
4605         return status;
4606 }
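
/*
 * Editor's sketch of how the reason_ret contract above might be consumed
 * by a caller (hedged: the surrounding variables and transaction handling
 * are hypothetical, only the call matches the signature above).
 *
 *	status = ocfs2_add_clusters_in_btree(osb, inode, &logical_start,
 *					     wanted, 0, root_bh, root_el,
 *					     handle, data_ac, meta_ac, &why,
 *					     OCFS2_DINODE_EXTENT, NULL);
 *	if (status == -EAGAIN && why == RESTART_META)
 *		(reserve more metadata, then retry the operation)
 *	else if (status == -EAGAIN && why == RESTART_TRANS)
 *		(restart or extend the transaction, then call again for
 *		 the clusters that are still missing)
 */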
4607
4608 static void ocfs2_make_right_split_rec(struct super_block *sb,
4609                                        struct ocfs2_extent_rec *split_rec,
4610                                        u32 cpos,
4611                                        struct ocfs2_extent_rec *rec)
4612 {
4613         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4614         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4615
4616         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4617
4618         split_rec->e_cpos = cpu_to_le32(cpos);
4619         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4620
4621         split_rec->e_blkno = rec->e_blkno;
4622         le64_add_cpu(&split_rec->e_blkno,
4623                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4624
4625         split_rec->e_flags = rec->e_flags;
4626 }
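
/*
 * Editor's worked example for the helper above (numbers invented for
 * illustration): splitting a record covering (e_cpos 100, 20 clusters,
 * e_blkno B) at cpos 112 produces a right half with
 *
 *	split_rec->e_cpos          = 112
 *	split_rec->e_leaf_clusters = (100 + 20) - 112 = 8
 *	split_rec->e_blkno         = B + ocfs2_clusters_to_blocks(sb, 12)
 *
 * and the original record's e_flags carried over unchanged.
 */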
4627
4628 static int ocfs2_split_and_insert(struct inode *inode,
4629                                   handle_t *handle,
4630                                   struct ocfs2_path *path,
4631                                   struct ocfs2_extent_tree *et,
4632                                   struct buffer_head **last_eb_bh,
4633                                   int split_index,
4634                                   struct ocfs2_extent_rec *orig_split_rec,
4635                                   struct ocfs2_alloc_context *meta_ac)
4636 {
4637         int ret = 0, depth;
4638         unsigned int insert_range, rec_range, do_leftright = 0;
4639         struct ocfs2_extent_rec tmprec;
4640         struct ocfs2_extent_list *rightmost_el;
4641         struct ocfs2_extent_rec rec;
4642         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4643         struct ocfs2_insert_type insert;
4644         struct ocfs2_extent_block *eb;
4645
4646 leftright:
4647         /*
4648          * Store a copy of the record on the stack - it might move
4649          * around as the tree is manipulated below.
4650          */
4651         rec = path_leaf_el(path)->l_recs[split_index];
4652
4653         rightmost_el = et->root_el;
4654
4655         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4656         if (depth) {
4657                 BUG_ON(!(*last_eb_bh));
4658                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4659                 rightmost_el = &eb->h_list;
4660         }
4661
4662         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4663             le16_to_cpu(rightmost_el->l_count)) {
4664                 ret = ocfs2_grow_tree(inode, handle, et,
4665                                       &depth, last_eb_bh, meta_ac);
4666                 if (ret) {
4667                         mlog_errno(ret);
4668                         goto out;
4669                 }
4670         }
4671
4672         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4673         insert.ins_appending = APPEND_NONE;
4674         insert.ins_contig = CONTIG_NONE;
4675         insert.ins_tree_depth = depth;
4676
4677         insert_range = le32_to_cpu(split_rec.e_cpos) +
4678                 le16_to_cpu(split_rec.e_leaf_clusters);
4679         rec_range = le32_to_cpu(rec.e_cpos) +
4680                 le16_to_cpu(rec.e_leaf_clusters);
4681
4682         if (split_rec.e_cpos == rec.e_cpos) {
4683                 insert.ins_split = SPLIT_LEFT;
4684         } else if (insert_range == rec_range) {
4685                 insert.ins_split = SPLIT_RIGHT;
4686         } else {
4687                 /*
4688                  * Left/right split. We fake this as a right split
4689                  * first and then make a second pass as a left split.
4690                  */
4691                 insert.ins_split = SPLIT_RIGHT;
4692
4693                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4694                                            &rec);
4695
4696                 split_rec = tmprec;
4697
4698                 BUG_ON(do_leftright);
4699                 do_leftright = 1;
4700         }
4701
4702         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4703         if (ret) {
4704                 mlog_errno(ret);
4705                 goto out;
4706         }
4707
4708         if (do_leftright == 1) {
4709                 u32 cpos;
4710                 struct ocfs2_extent_list *el;
4711
4712                 do_leftright++;
4713                 split_rec = *orig_split_rec;
4714
4715                 ocfs2_reinit_path(path, 1);
4716
4717                 cpos = le32_to_cpu(split_rec.e_cpos);
4718                 ret = ocfs2_find_path(inode, path, cpos);
4719                 if (ret) {
4720                         mlog_errno(ret);
4721                         goto out;
4722                 }
4723
4724                 el = path_leaf_el(path);
4725                 split_index = ocfs2_search_extent_list(el, cpos);
4726                 goto leftright;
4727         }
4728 out:
4729
4730         return ret;
4731 }
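
/*
 * Editor's illustration of the paths above, with invented numbers.  Take
 * an existing record covering clusters [100, 120) and a split_rec that
 * should replace part of it:
 *
 *   - split_rec [100, 108): shares the left edge  -> one SPLIT_LEFT pass
 *   - split_rec [112, 120): shares the right edge -> one SPLIT_RIGHT pass
 *   - split_rec [104, 112): interior.  The first pass carves off the
 *     untouched right-hand piece [112, 120), keeping the old flags, then
 *     the code loops back ("leftright") and the second pass handles
 *     [104, 112) against the now-shrunken record [100, 112).
 */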
4732
4733 /*
4734  * Mark part or all of the extent record at split_index in the leaf
4735  * pointed to by path as written. This removes the unwritten
4736  * extent flag.
4737  *
4738  * Care is taken to handle contiguousness so as to not grow the tree.
4739  *
4740  * meta_ac is not strictly necessary - we only truly need it if growth
4741  * of the tree is required. All other cases will degrade into a less
4742  * optimal tree layout.
4743  *
4744  * last_eb_bh should be the rightmost leaf block for any extent
4745  * btree. Since a split may grow the tree or a merge might shrink it,
4746  * the caller cannot trust the contents of that buffer after this call.
4747  *
4748  * This code is optimized for readability - several passes might be
4749  * made over certain portions of the tree. All of those blocks will
4750  * have been brought into cache (and pinned via the journal), so the
4751  * extra overhead is not expressed in terms of disk reads.
4752  */
4753 static int __ocfs2_mark_extent_written(struct inode *inode,
4754                                        struct ocfs2_extent_tree *et,
4755                                        handle_t *handle,
4756                                        struct ocfs2_path *path,
4757                                        int split_index,
4758                                        struct ocfs2_extent_rec *split_rec,
4759                                        struct ocfs2_alloc_context *meta_ac,
4760                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4761 {
4762         int ret = 0;
4763         struct ocfs2_extent_list *el = path_leaf_el(path);
4764         struct buffer_head *last_eb_bh = NULL;
4765         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4766         struct ocfs2_merge_ctxt ctxt;
4767         struct ocfs2_extent_list *rightmost_el;
4768
4769         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4770                 ret = -EIO;
4771                 mlog_errno(ret);
4772                 goto out;
4773         }
4774
4775         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4776             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4777              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4778                 ret = -EIO;
4779                 mlog_errno(ret);
4780                 goto out;
4781         }
4782
4783         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4784                                                             split_index,
4785                                                             split_rec);
4786
4787         /*
4788          * The core merge / split code wants to know how much room is
4789          * left in this inode's allocation tree, so we pass the
4790          * rightmost extent list.
4791          */
4792         if (path->p_tree_depth) {
4793                 struct ocfs2_extent_block *eb;
4794
4795                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4796                                        ocfs2_get_last_eb_blk(et),
4797                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4798                 if (ret) {
4799                         mlog_exit(ret);
4800                         goto out;
4801                 }
4802
4803                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4804                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4805                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4806                         ret = -EROFS;
4807                         goto out;
4808                 }
4809
4810                 rightmost_el = &eb->h_list;
4811         } else
4812                 rightmost_el = path_root_el(path);
4813
4814         if (rec->e_cpos == split_rec->e_cpos &&
4815             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4816                 ctxt.c_split_covers_rec = 1;
4817         else
4818                 ctxt.c_split_covers_rec = 0;
4819
4820         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4821
4822         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4823              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4824              ctxt.c_split_covers_rec);
4825
4826         if (ctxt.c_contig_type == CONTIG_NONE) {
4827                 if (ctxt.c_split_covers_rec)
4828                         el->l_recs[split_index] = *split_rec;
4829                 else
4830                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4831                                                      &last_eb_bh, split_index,
4832                                                      split_rec, meta_ac);
4833                 if (ret)
4834                         mlog_errno(ret);
4835         } else {
4836                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4837                                                 split_index, split_rec,
4838                                                 dealloc, &ctxt, et);
4839                 if (ret)
4840                         mlog_errno(ret);
4841         }
4842
4843 out:
4844         brelse(last_eb_bh);
4845         return ret;
4846 }
4847
4848 /*
4849  * Mark the already-existing extent at cpos as written for len clusters.
4850  *
4851  * If the existing extent is larger than the request, initiate a
4852  * split. An attempt will be made at merging with adjacent extents.
4853  *
4854  * The caller is responsible for passing down meta_ac if we'll need it.
4855  */
4856 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
4857                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4858                               struct ocfs2_alloc_context *meta_ac,
4859                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4860                               enum ocfs2_extent_tree_type et_type,
4861                               void *private)
4862 {
4863         int ret, index;
4864         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4865         struct ocfs2_extent_rec split_rec;
4866         struct ocfs2_path *left_path = NULL;
4867         struct ocfs2_extent_list *el;
4868         struct ocfs2_extent_tree *et = NULL;
4869
4870         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4871              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4872
4873         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4874                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4875                             "that are being written to, but the feature bit "
4876                             "is not set in the super block.",
4877                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4878                 ret = -EROFS;
4879                 goto out;
4880         }
4881
4882         et = ocfs2_new_extent_tree(root_bh, et_type, private);
4883         if (!et) {
4884                 ret = -ENOMEM;
4885                 mlog_errno(ret);
4886                 goto out;
4887         }
4888
4889         /*
4890          * XXX: This should be fixed up so that we just re-insert the
4891          * next extent records.
4892          */
4893         if (et_type == OCFS2_DINODE_EXTENT)
4894                 ocfs2_extent_map_trunc(inode, 0);
4895
4896         left_path = ocfs2_new_path(et->root_bh, et->root_el);
4897         if (!left_path) {
4898                 ret = -ENOMEM;
4899                 mlog_errno(ret);
4900                 goto out;
4901         }
4902
4903         ret = ocfs2_find_path(inode, left_path, cpos);
4904         if (ret) {
4905                 mlog_errno(ret);
4906                 goto out;
4907         }
4908         el = path_leaf_el(left_path);
4909
4910         index = ocfs2_search_extent_list(el, cpos);
4911         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4912                 ocfs2_error(inode->i_sb,
4913                             "Inode %llu has an extent at cpos %u which can no "
4914                             "longer be found.\n",
4915                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4916                 ret = -EROFS;
4917                 goto out;
4918         }
4919
4920         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4921         split_rec.e_cpos = cpu_to_le32(cpos);
4922         split_rec.e_leaf_clusters = cpu_to_le16(len);
4923         split_rec.e_blkno = cpu_to_le64(start_blkno);
4924         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4925         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4926
4927         ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4928                                           index, &split_rec, meta_ac,
4929                                           dealloc);
4930         if (ret)
4931                 mlog_errno(ret);
4932
4933 out:
4934         ocfs2_free_path(left_path);
4935         if (et)
4936                 ocfs2_free_extent_tree(et);
4937         return ret;
4938 }
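
/*
 * Editor's usage sketch for the function above (hedged: the surrounding
 * handle/meta_ac/dealloc setup is hypothetical, only the call itself is
 * taken from the signature above).  Clearing the unwritten flag on 'len'
 * clusters of inode data starting at logical cluster 'cpos', physical
 * cluster 'phys':
 *
 *	ret = ocfs2_mark_extent_written(inode, di_bh, handle, cpos, len,
 *					phys, meta_ac, &dealloc,
 *					OCFS2_DINODE_EXTENT, NULL);
 *	if (ret)
 *		mlog_errno(ret);
 */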
4939
4940 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4941                             handle_t *handle, struct ocfs2_path *path,
4942                             int index, u32 new_range,
4943                             struct ocfs2_alloc_context *meta_ac)
4944 {
4945         int ret, depth, credits = handle->h_buffer_credits;
4946         struct buffer_head *last_eb_bh = NULL;
4947         struct ocfs2_extent_block *eb;
4948         struct ocfs2_extent_list *rightmost_el, *el;
4949         struct ocfs2_extent_rec split_rec;
4950         struct ocfs2_extent_rec *rec;
4951         struct ocfs2_insert_type insert;
4952
4953         /*
4954          * Setup the record to split before we grow the tree.
4955          */
4956         el = path_leaf_el(path);
4957         rec = &el->l_recs[index];
4958         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4959
4960         depth = path->p_tree_depth;
4961         if (depth > 0) {
4962                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4963                                        ocfs2_get_last_eb_blk(et),
4964                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4965                 if (ret < 0) {
4966                         mlog_errno(ret);
4967                         goto out;
4968                 }
4969
4970                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4971                 rightmost_el = &eb->h_list;
4972         } else
4973                 rightmost_el = path_leaf_el(path);
4974
4975         credits += path->p_tree_depth +
4976                    ocfs2_extend_meta_needed(et->root_el);
4977         ret = ocfs2_extend_trans(handle, credits);
4978         if (ret) {
4979                 mlog_errno(ret);
4980                 goto out;
4981         }
4982
4983         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4984             le16_to_cpu(rightmost_el->l_count)) {
4985                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4986                                       meta_ac);
4987                 if (ret) {
4988                         mlog_errno(ret);
4989                         goto out;
4990                 }
4991         }
4992
4993         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4994         insert.ins_appending = APPEND_NONE;
4995         insert.ins_contig = CONTIG_NONE;
4996         insert.ins_split = SPLIT_RIGHT;
4997         insert.ins_tree_depth = depth;
4998
4999         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
5000         if (ret)
5001                 mlog_errno(ret);
5002
5003 out:
5004         brelse(last_eb_bh);
5005         return ret;
5006 }
5007
5008 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5009                               struct ocfs2_path *path, int index,
5010                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5011                               u32 cpos, u32 len,
5012                               struct ocfs2_extent_tree *et)
5013 {
5014         int ret;
5015         u32 left_cpos, rec_range, trunc_range;
5016         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5017         struct super_block *sb = inode->i_sb;
5018         struct ocfs2_path *left_path = NULL;
5019         struct ocfs2_extent_list *el = path_leaf_el(path);
5020         struct ocfs2_extent_rec *rec;
5021         struct ocfs2_extent_block *eb;
5022
5023         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5024                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5025                 if (ret) {
5026                         mlog_errno(ret);
5027                         goto out;
5028                 }
5029
5030                 index--;
5031         }
5032
5033         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5034             path->p_tree_depth) {
5035                 /*
5036                  * Check whether this is the rightmost tree record. If
5037                  * we remove all of this record or part of its right
5038                  * edge then an update of the record lengths above it
5039                  * will be required.
5040                  */
5041                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5042                 if (eb->h_next_leaf_blk == 0)
5043                         is_rightmost_tree_rec = 1;
5044         }
5045
5046         rec = &el->l_recs[index];
5047         if (index == 0 && path->p_tree_depth &&
5048             le32_to_cpu(rec->e_cpos) == cpos) {
5049                 /*
5050                  * Changing the leftmost offset (via partial or whole
5051                  * record truncate) of an interior (or rightmost) path
5052                  * means we have to update the subtree that is formed
5053          * by this leaf and the one to its left.
5054                  *
5055                  * There are two cases we can skip:
5056                  *   1) Path is the leftmost one in our inode tree.
5057                  *   2) The leaf is rightmost and will be empty after
5058                  *      we remove the extent record - the rotate code
5059                  *      knows how to update the newly formed edge.
5060                  */
5061
5062                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5063                                                     &left_cpos);
5064                 if (ret) {
5065                         mlog_errno(ret);
5066                         goto out;
5067                 }
5068
5069                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5070                         left_path = ocfs2_new_path(path_root_bh(path),
5071                                                    path_root_el(path));
5072                         if (!left_path) {
5073                                 ret = -ENOMEM;
5074                                 mlog_errno(ret);
5075                                 goto out;
5076                         }
5077
5078                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5079                         if (ret) {
5080                                 mlog_errno(ret);
5081                                 goto out;
5082                         }
5083                 }
5084         }
5085
5086         ret = ocfs2_extend_rotate_transaction(handle, 0,
5087                                               handle->h_buffer_credits,
5088                                               path);
5089         if (ret) {
5090                 mlog_errno(ret);
5091                 goto out;
5092         }
5093
5094         ret = ocfs2_journal_access_path(inode, handle, path);
5095         if (ret) {
5096                 mlog_errno(ret);
5097                 goto out;
5098         }
5099
5100         ret = ocfs2_journal_access_path(inode, handle, left_path);
5101         if (ret) {
5102                 mlog_errno(ret);
5103                 goto out;
5104         }
5105
5106         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5107         trunc_range = cpos + len;
5108
5109         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5110                 int next_free;
5111
5112                 memset(rec, 0, sizeof(*rec));
5113                 ocfs2_cleanup_merge(el, index);
5114                 wants_rotate = 1;
5115
5116                 next_free = le16_to_cpu(el->l_next_free_rec);
5117                 if (is_rightmost_tree_rec && next_free > 1) {
5118                         /*
5119                          * We skip the edge update if this path will
5120                          * be deleted by the rotate code.
5121                          */
5122                         rec = &el->l_recs[next_free - 1];
5123                         ocfs2_adjust_rightmost_records(inode, handle, path,
5124                                                        rec);
5125                 }
5126         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5127                 /* Remove leftmost portion of the record. */
5128                 le32_add_cpu(&rec->e_cpos, len);
5129                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5130                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5131         } else if (rec_range == trunc_range) {
5132                 /* Remove rightmost portion of the record */
5133                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5134                 if (is_rightmost_tree_rec)
5135                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5136         } else {
5137                 /* Caller should have trapped this. */
5138                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5139                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5140                      le32_to_cpu(rec->e_cpos),
5141                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5142                 BUG();
5143         }
5144
5145         if (left_path) {
5146                 int subtree_index;
5147
5148                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5149                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5150                                            subtree_index);
5151         }
5152
5153         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5154
5155         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5156         if (ret) {
5157                 mlog_errno(ret);
5158                 goto out;
5159         }
5160
5161 out:
5162         ocfs2_free_path(left_path);
5163         return ret;
5164 }
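
/*
 * Editor's note, with invented numbers, on the three in-leaf cases
 * handled above for a record covering clusters [100, 120):
 *
 *   - remove (100, 20): the whole record is zeroed and the empty slot is
 *     rotated away, with a rightmost-edge update where needed;
 *   - remove (100, 5):  leftmost trim, e_cpos becomes 105, e_blkno is
 *     advanced by ocfs2_clusters_to_blocks(sb, 5), e_leaf_clusters 15;
 *   - remove (115, 5):  rightmost trim, e_leaf_clusters becomes 15 and,
 *     if this is the rightmost tree record, the lengths stored in the
 *     records above it are adjusted as well.
 *
 * A middle-of-record remove never reaches this function directly; see
 * ocfs2_remove_extent() below, which splits first.
 */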
5165
5166 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
5167                         u32 cpos, u32 len, handle_t *handle,
5168                         struct ocfs2_alloc_context *meta_ac,
5169                         struct ocfs2_cached_dealloc_ctxt *dealloc,
5170                         enum ocfs2_extent_tree_type et_type,
5171                         void *private)
5172 {
5173         int ret, index;
5174         u32 rec_range, trunc_range;
5175         struct ocfs2_extent_rec *rec;
5176         struct ocfs2_extent_list *el;
5177         struct ocfs2_path *path = NULL;
5178         struct ocfs2_extent_tree *et = NULL;
5179
5180         et = ocfs2_new_extent_tree(root_bh, et_type, private);
5181         if (!et) {
5182                 ret = -ENOMEM;
5183                 mlog_errno(ret);
5184                 goto out;
5185         }
5186
5187         ocfs2_extent_map_trunc(inode, 0);
5188
5189         path = ocfs2_new_path(et->root_bh, et->root_el);
5190         if (!path) {
5191                 ret = -ENOMEM;
5192                 mlog_errno(ret);
5193                 goto out;
5194         }
5195
5196         ret = ocfs2_find_path(inode, path, cpos);
5197         if (ret) {
5198                 mlog_errno(ret);
5199                 goto out;
5200         }
5201
5202         el = path_leaf_el(path);
5203         index = ocfs2_search_extent_list(el, cpos);
5204         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5205                 ocfs2_error(inode->i_sb,
5206                             "Inode %llu has an extent at cpos %u which can no "
5207                             "longer be found.\n",
5208                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5209                 ret = -EROFS;
5210                 goto out;
5211         }
5212
5213         /*
5214          * We have 3 cases of extent removal:
5215          *   1) Range covers the entire extent rec
5216          *   2) Range begins or ends on one edge of the extent rec
5217          *   3) Range is in the middle of the extent rec (no shared edges)
5218          *
5219          * For case 1 we remove the extent rec and left rotate to
5220          * fill the hole.
5221          *
5222          * For case 2 we just shrink the existing extent rec, with a
5223          * tree update if the shrinking edge is also the edge of an
5224          * extent block.
5225          *
5226          * For case 3 we do a right split to turn the extent rec into
5227          * something case 2 can handle.
5228          */
5229         rec = &el->l_recs[index];
5230         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5231         trunc_range = cpos + len;
5232
5233         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5234
5235         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5236              "(cpos %u, len %u)\n",
5237              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5238              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5239
5240         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5241                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5242                                          cpos, len, et);
5243                 if (ret) {
5244                         mlog_errno(ret);
5245                         goto out;
5246                 }
5247         } else {
5248                 ret = ocfs2_split_tree(inode, et, handle, path, index,
5249                                        trunc_range, meta_ac);
5250                 if (ret) {
5251                         mlog_errno(ret);
5252                         goto out;
5253                 }
5254
5255                 /*
5256                  * The split could have manipulated the tree enough to
5257                  * move the record location, so we have to look for it again.
5258                  */
5259                 ocfs2_reinit_path(path, 1);
5260
5261                 ret = ocfs2_find_path(inode, path, cpos);
5262                 if (ret) {
5263                         mlog_errno(ret);
5264                         goto out;
5265                 }
5266
5267                 el = path_leaf_el(path);
5268                 index = ocfs2_search_extent_list(el, cpos);
5269                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5270                         ocfs2_error(inode->i_sb,
5271                                     "Inode %llu: split at cpos %u lost record.",
5272                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5273                                     cpos);
5274                         ret = -EROFS;
5275                         goto out;
5276                 }
5277
5278                 /*
5279                  * Double check our values here. If anything is fishy,
5280                  * it's easier to catch it at the top level.
5281                  */
5282                 rec = &el->l_recs[index];
5283                 rec_range = le32_to_cpu(rec->e_cpos) +
5284                         ocfs2_rec_clusters(el, rec);
5285                 if (rec_range != trunc_range) {
5286                         ocfs2_error(inode->i_sb,
5287                                     "Inode %llu: error after split at cpos %u, "
5288                                     "trunc len %u, existing record is (%u,%u)",
5289                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5290                                     cpos, len, le32_to_cpu(rec->e_cpos),
5291                                     ocfs2_rec_clusters(el, rec));
5292                         ret = -EROFS;
5293                         goto out;
5294                 }
5295
5296                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5297                                          cpos, len, et);
5298                 if (ret) {
5299                         mlog_errno(ret);
5300                         goto out;
5301                 }
5302         }
5303
5304 out:
5305         ocfs2_free_path(path);
5306         if (et)
5307                 ocfs2_free_extent_tree(et);
5308         return ret;
5309 }
5310
5311 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5312 {
5313         struct buffer_head *tl_bh = osb->osb_tl_bh;
5314         struct ocfs2_dinode *di;
5315         struct ocfs2_truncate_log *tl;
5316
5317         di = (struct ocfs2_dinode *) tl_bh->b_data;
5318         tl = &di->id2.i_dealloc;
5319
5320         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5321                         "slot %d, invalid truncate log parameters: used = "
5322                         "%u, count = %u\n", osb->slot_num,
5323                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5324         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5325 }
5326
5327 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5328                                            unsigned int new_start)
5329 {
5330         unsigned int tail_index;
5331         unsigned int current_tail;
5332
5333         /* No records, nothing to coalesce */
5334         if (!le16_to_cpu(tl->tl_used))
5335                 return 0;
5336
5337         tail_index = le16_to_cpu(tl->tl_used) - 1;
5338         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5339         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5340
5341         return current_tail == new_start;
5342 }
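
/*
 * Editor's example for the check above (invented values): if the last
 * truncate log record is { t_start = 200, t_clusters = 16 }, then
 * current_tail is 216, so a new range starting at cluster 216 is merged
 * into that record while one starting at cluster 217 gets a fresh slot.
 */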
5343
5344 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5345                               handle_t *handle,
5346                               u64 start_blk,
5347                               unsigned int num_clusters)
5348 {
5349         int status, index;
5350         unsigned int start_cluster, tl_count;
5351         struct inode *tl_inode = osb->osb_tl_inode;
5352         struct buffer_head *tl_bh = osb->osb_tl_bh;
5353         struct ocfs2_dinode *di;
5354         struct ocfs2_truncate_log *tl;
5355
5356         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5357                    (unsigned long long)start_blk, num_clusters);
5358
5359         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5360
5361         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5362
5363         di = (struct ocfs2_dinode *) tl_bh->b_data;
5364         tl = &di->id2.i_dealloc;
5365         if (!OCFS2_IS_VALID_DINODE(di)) {
5366                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5367                 status = -EIO;
5368                 goto bail;
5369         }
5370
5371         tl_count = le16_to_cpu(tl->tl_count);
5372         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5373                         tl_count == 0,
5374                         "Truncate record count on #%llu invalid: "
5375                         "wanted %u, actual %u\n",
5376                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5377                         ocfs2_truncate_recs_per_inode(osb->sb),
5378                         le16_to_cpu(tl->tl_count));
5379
5380         /* Caller should have known to flush before calling us. */
5381         index = le16_to_cpu(tl->tl_used);
5382         if (index >= tl_count) {
5383                 status = -ENOSPC;
5384                 mlog_errno(status);
5385                 goto bail;
5386         }
5387
5388         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5389                                       OCFS2_JOURNAL_ACCESS_WRITE);
5390         if (status < 0) {
5391                 mlog_errno(status);
5392                 goto bail;
5393         }
5394
5395         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5396              "%llu (index = %d)\n", num_clusters, start_cluster,
5397              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5398
5399         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5400                 /*
5401                  * Move index back to the record we are coalescing with.
5402                  * ocfs2_truncate_log_can_coalesce() guarantees tl_used is nonzero.
5403                  */
5404                 index--;
5405
5406                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5407                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5408                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5409                      num_clusters);
5410         } else {
5411                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5412                 tl->tl_used = cpu_to_le16(index + 1);
5413         }
5414         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5415
5416         status = ocfs2_journal_dirty(handle, tl_bh);
5417         if (status < 0) {
5418                 mlog_errno(status);
5419                 goto bail;
5420         }
5421
5422 bail:
5423         mlog_exit(status);
5424         return status;
5425 }
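
/*
 * Editor's usage sketch (an assumed typical caller, pieced together from
 * the functions in this file rather than copied from it): the truncate
 * log inode's i_mutex must already be held, and a full log must be
 * flushed before appending.  'handle' is a hypothetical transaction
 * started after any flush, with enough credits for the log update.
 *
 *	mutex_lock(&osb->osb_tl_inode->i_mutex);
 *	if (ocfs2_truncate_log_needs_flush(osb)) {
 *		status = __ocfs2_flush_truncate_log(osb);
 *		if (status < 0)
 *			goto out;
 *	}
 *	status = ocfs2_truncate_log_append(osb, handle, start_blk,
 *					   num_clusters);
 * out:
 *	mutex_unlock(&osb->osb_tl_inode->i_mutex);
 */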
5426
5427 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5428                                          handle_t *handle,
5429                                          struct inode *data_alloc_inode,
5430                                          struct buffer_head *data_alloc_bh)
5431 {
5432         int status = 0;
5433         int i;
5434         unsigned int num_clusters;
5435         u64 start_blk;
5436         struct ocfs2_truncate_rec rec;
5437         struct ocfs2_dinode *di;
5438         struct ocfs2_truncate_log *tl;
5439         struct inode *tl_inode = osb->osb_tl_inode;
5440         struct buffer_head *tl_bh = osb->osb_tl_bh;
5441
5442         mlog_entry_void();
5443
5444         di = (struct ocfs2_dinode *) tl_bh->b_data;
5445         tl = &di->id2.i_dealloc;
5446         i = le16_to_cpu(tl->tl_used) - 1;
5447         while (i >= 0) {
5448                 /* Caller has given us at least enough credits to
5449                  * update the truncate log dinode */
5450                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5451                                               OCFS2_JOURNAL_ACCESS_WRITE);
5452                 if (status < 0) {
5453                         mlog_errno(status);
5454                         goto bail;
5455                 }
5456
5457                 tl->tl_used = cpu_to_le16(i);
5458
5459                 status = ocfs2_journal_dirty(handle, tl_bh);
5460                 if (status < 0) {
5461                         mlog_errno(status);
5462                         goto bail;
5463                 }
5464
5465                 /* TODO: Perhaps we can calculate the bulk of the
5466                  * credits up front rather than extending like
5467                  * this. */
5468                 status = ocfs2_extend_trans(handle,
5469                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5470                 if (status < 0) {
5471                         mlog_errno(status);
5472                         goto bail;
5473                 }
5474
5475                 rec = tl->tl_recs[i];
5476                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5477                                                     le32_to_cpu(rec.t_start));
5478                 num_clusters = le32_to_cpu(rec.t_clusters);
5479
5480                 /* if start_blk is not set, we ignore the record as
5481                  * invalid. */
5482                 if (start_blk) {
5483                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5484                              i, le32_to_cpu(rec.t_start), num_clusters);
5485
5486                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5487                                                      data_alloc_bh, start_blk,
5488                                                      num_clusters);
5489                         if (status < 0) {
5490                                 mlog_errno(status);
5491                                 goto bail;
5492                         }
5493                 }
5494                 i--;
5495         }
5496
5497 bail:
5498         mlog_exit(status);
5499         return status;
5500 }
5501
5502 /* Expects you to already be holding tl_inode->i_mutex */
5503 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5504 {
5505         int status;
5506         unsigned int num_to_flush;
5507         handle_t *handle;
5508         struct inode *tl_inode = osb->osb_tl_inode;
5509         struct inode *data_alloc_inode = NULL;
5510         struct buffer_head *tl_bh = osb->osb_tl_bh;
5511         struct buffer_head *data_alloc_bh = NULL;
5512         struct ocfs2_dinode *di;
5513         struct ocfs2_truncate_log *tl;
5514
5515         mlog_entry_void();
5516
5517         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5518
5519         di = (struct ocfs2_dinode *) tl_bh->b_data;
5520         tl = &di->id2.i_dealloc;
5521         if (!OCFS2_IS_VALID_DINODE(di)) {
5522                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5523                 status = -EIO;
5524                 goto out;
5525         }
5526
5527         num_to_flush = le16_to_cpu(tl->tl_used);
5528         mlog(0, "Flush %u records from truncate log #%llu\n",
5529              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5530         if (!num_to_flush) {
5531                 status = 0;
5532                 goto out;
5533         }
5534
5535         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5536                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5537                                                        OCFS2_INVALID_SLOT);
5538         if (!data_alloc_inode) {
5539                 status = -EINVAL;
5540                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5541                 goto out;
5542         }
5543
5544         mutex_lock(&data_alloc_inode->i_mutex);
5545
5546         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5547         if (status < 0) {
5548                 mlog_errno(status);
5549                 goto out_mutex;
5550         }
5551
5552         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5553         if (IS_ERR(handle)) {
5554                 status = PTR_ERR(handle);
5555                 mlog_errno(status);
5556                 goto out_unlock;
5557         }
5558
5559         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5560                                                data_alloc_bh);
5561         if (status < 0)
5562                 mlog_errno(status);
5563
5564         ocfs2_commit_trans(osb, handle);
5565
5566 out_unlock:
5567         brelse(data_alloc_bh);
5568         ocfs2_inode_unlock(data_alloc_inode, 1);
5569
5570 out_mutex:
5571         mutex_unlock(&data_alloc_inode->i_mutex);
5572         iput(data_alloc_inode);
5573
5574 out:
5575         mlog_exit(status);
5576         return status;
5577 }
5578
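     /* Same as above, but takes tl_inode->i_mutex for the caller. */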
5579 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5580 {
5581         int status;
5582         struct inode *tl_inode = osb->osb_tl_inode;
5583
5584         mutex_lock(&tl_inode->i_mutex);
5585         status = __ocfs2_flush_truncate_log(osb);
5586         mutex_unlock(&tl_inode->i_mutex);
5587
5588         return status;
5589 }
5590
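     /*
      * Delayed-work handler behind osb_truncate_log_wq: flush the truncate
      * log in the background and, on success, re-initialize the inode
      * steal slot.
      */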
5591 static void ocfs2_truncate_log_worker(struct work_struct *work)
5592 {
5593         int status;
5594         struct ocfs2_super *osb =
5595                 container_of(work, struct ocfs2_super,
5596                              osb_truncate_log_wq.work);
5597
5598         mlog_entry_void();
5599
5600         status = ocfs2_flush_truncate_log(osb);
5601         if (status < 0)
5602                 mlog_errno(status);
5603         else
5604                 ocfs2_init_inode_steal_slot(osb);
5605
5606         mlog_exit(status);
5607 }
5608
5609 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
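     /*
      * (Re)arm the delayed truncate log flush. Passing cancel != 0 first
      * cancels any already-queued work, effectively pushing the flush back
      * by another OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL while truncates are
      * still coming in.
      */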
5610 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5611                                        int cancel)
5612 {
5613         if (osb->osb_tl_inode) {
5614                 /* We want to push off log flushes while truncates are
5615                  * still running. */
5616                 if (cancel)
5617                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5618
5619                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5620                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5621         }
5622 }
5623
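     /*
      * Look up the truncate log system inode for a slot and read its dinode
      * block. On success the caller owns a reference on *tl_inode and a
      * buffer_head reference in *tl_bh.
      */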
5624 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5625                                        int slot_num,
5626                                        struct inode **tl_inode,
5627                                        struct buffer_head **tl_bh)
5628 {
5629         int status;
5630         struct inode *inode = NULL;
5631         struct buffer_head *bh = NULL;
5632
5633         inode = ocfs2_get_system_file_inode(osb,
5634                                            TRUNCATE_LOG_SYSTEM_INODE,
5635                                            slot_num);
5636         if (!inode) {
5637                 status = -EINVAL;
5638                 mlog(ML_ERROR, "Could not get truncate log inode!\n");
5639                 goto bail;
5640         }
5641
5642         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5643                                   OCFS2_BH_CACHED, inode);
5644         if (status < 0) {
5645                 iput(inode);
5646                 mlog_errno(status);
5647                 goto bail;
5648         }
5649
5650         *tl_inode = inode;
5651         *tl_bh    = bh;
5652 bail:
5653         mlog_exit(status);
5654         return status;
5655 }
5656
5657 /* Called during the 1st stage of node recovery. We stamp a clean
5658  * truncate log and pass back a copy for processing later. If the
5659  * truncate log does not require processing, *tl_copy is set to
5660  * NULL. */
5661 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5662                                       int slot_num,
5663                                       struct ocfs2_dinode **tl_copy)
5664 {
5665         int status;
5666         struct inode *tl_inode = NULL;
5667         struct buffer_head *tl_bh = NULL;
5668         struct ocfs2_dinode *di;
5669         struct ocfs2_truncate_log *tl;
5670
5671         *tl_copy = NULL;
5672
5673         mlog(0, "recover truncate log from slot %d\n", slot_num);
5674
5675         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5676         if (status < 0) {
5677                 mlog_errno(status);
5678                 goto bail;
5679         }
5680
5681         di = (struct ocfs2_dinode *) tl_bh->b_data;
5682         tl = &di->id2.i_dealloc;
5683         if (!OCFS2_IS_VALID_DINODE(di)) {
5684                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5685                 status = -EIO;
5686                 goto bail;
5687         }
5688
5689         if (le16_to_cpu(tl->tl_used)) {
5690                 mlog(0, "We'll have %u logs to recover\n",
5691                      le16_to_cpu(tl->tl_used));
5692
5693                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5694                 if (!(*tl_copy)) {
5695                         status = -ENOMEM;
5696                         mlog_errno(status);
5697                         goto bail;
5698                 }
5699
5700                 /* Assuming the write-out below goes well, this copy
5701                  * will be passed back to recovery for processing. */
5702                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5703
5704                 /* All we need to do to clear the truncate log is set
5705                  * tl_used to zero. */
5706                 tl->tl_used = 0;
5707
5708                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5709                 if (status < 0) {
5710                         mlog_errno(status);
5711                         goto bail;
5712                 }
5713         }
5714
5715 bail:
5716         if (tl_inode)
5717                 iput(tl_inode);
5718         if (tl_bh)
5719                 brelse(tl_bh);
5720
5721         if (status < 0 && (*tl_copy)) {
5722                 kfree(*tl_copy);
5723                 *tl_copy = NULL;
5724         }
5725
5726         mlog_exit(status);
5727         return status;
5728 }
5729
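     /*
      * Second stage of truncate log recovery: replay the records from the
      * dinode copy handed back by ocfs2_begin_truncate_log_recovery() by
      * appending them to our own truncate log, flushing it whenever it
      * fills up. We refuse to "recover" our own log.
      */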
5730 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5731                                          struct ocfs2_dinode *tl_copy)
5732 {
5733         int status = 0;
5734         int i;
5735         unsigned int clusters, num_recs, start_cluster;
5736         u64 start_blk;
5737         handle_t *handle;
5738         struct inode *tl_inode = osb->osb_tl_inode;
5739         struct ocfs2_truncate_log *tl;
5740
5741         mlog_entry_void();
5742
5743         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5744                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5745                 return -EINVAL;
5746         }
5747
5748         tl = &tl_copy->id2.i_dealloc;
5749         num_recs = le16_to_cpu(tl->tl_used);
5750         mlog(0, "cleanup %u records from %llu\n", num_recs,
5751              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5752
5753         mutex_lock(&tl_inode->i_mutex);
5754         for(i = 0; i < num_recs; i++) {
5755                 if (ocfs2_truncate_log_needs_flush(osb)) {
5756                         status = __ocfs2_flush_truncate_log(osb);
5757                         if (status < 0) {
5758                                 mlog_errno(status);
5759                                 goto bail_up;
5760                         }
5761                 }
5762
5763                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5764                 if (IS_ERR(handle)) {
5765                         status = PTR_ERR(handle);
5766                         mlog_errno(status);
5767                         goto bail_up;
5768                 }
5769
5770                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5771                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5772                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5773
5774                 status = ocfs2_truncate_log_append(osb, handle,
5775                                                    start_blk, clusters);
5776                 ocfs2_commit_trans(osb, handle);
5777                 if (status < 0) {
5778                         mlog_errno(status);
5779                         goto bail_up;
5780                 }
5781         }
5782
5783 bail_up:
5784         mutex_unlock(&tl_inode->i_mutex);
5785
5786         mlog_exit(status);
5787         return status;
5788 }
5789
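     /*
      * Called at unmount: cancel the delayed flush work, do a final
      * synchronous flush and drop the truncate log buffer and inode
      * references.
      */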
5790 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5791 {
5792         int status;
5793         struct inode *tl_inode = osb->osb_tl_inode;
5794
5795         mlog_entry_void();
5796
5797         if (tl_inode) {
5798                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5799                 flush_workqueue(ocfs2_wq);
5800
5801                 status = ocfs2_flush_truncate_log(osb);
5802                 if (status < 0)
5803                         mlog_errno(status);
5804
5805                 brelse(osb->osb_tl_bh);
5806                 iput(osb->osb_tl_inode);
5807         }
5808
5809         mlog_exit_void();
5810 }
5811
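     /*
      * Called at mount: load this node's truncate log and set up the
      * delayed flush work. If the log can't be read, osb_tl_inode stays
      * NULL so ocfs2_truncate_log_shutdown() knows there is nothing to
      * tear down.
      */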
5812 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5813 {
5814         int status;
5815         struct inode *tl_inode = NULL;
5816         struct buffer_head *tl_bh = NULL;
5817
5818         mlog_entry_void();
5819
5820         status = ocfs2_get_truncate_log_info(osb,
5821                                              osb->slot_num,
5822                                              &tl_inode,
5823                                              &tl_bh);
5824         if (status < 0)
5825                 mlog_errno(status);
5826
5827         /* ocfs2_truncate_log_shutdown keys on the existence of
5828          * osb->osb_tl_inode so we don't set any of the osb variables
5829          * until we're sure all is well. */
5830         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5831                           ocfs2_truncate_log_worker);
5832         osb->osb_tl_bh    = tl_bh;
5833         osb->osb_tl_inode = tl_inode;
5834
5835         mlog_exit(status);
5836         return status;
5837 }
5838
5839 /*
5840  * Delayed de-allocation of suballocator blocks.
5841  *
5842  * Some sets of block de-allocations might involve multiple suballocator inodes.
5843  *
5844  * The locking for this can get extremely complicated, especially when
5845  * the suballocator inodes to delete from aren't known until deep
5846  * within an unrelated codepath.
5847  *
5848  * ocfs2_extent_block structures are a good example of this - an inode
5849  * btree could have been grown by any number of nodes each allocating
5850  * out of their own suballoc inode.
5851  *
5852  * These structures allow the delay of block de-allocation until a
5853  * later time, when locking of multiple cluster inodes won't cause
5854  * deadlock.
5855  */
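
     /*
      * Rough shape of a user, as seen in the truncate code later in this
      * file: initialize a context with ocfs2_init_dealloc_ctxt(), queue
      * blocks with ocfs2_cache_extent_block_free() while still holding the
      * locks the current operation needs, then call ocfs2_run_deallocs()
      * at a point where taking the suballocator inode locks can't deadlock.
      */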
5856
5857 /*
5858  * Describes a single block free from a suballocator
5859  */
5860 struct ocfs2_cached_block_free {
5861         struct ocfs2_cached_block_free          *free_next;
5862         u64                                     free_blk;
5863         unsigned int                            free_bit;
5864 };
5865
5866 struct ocfs2_per_slot_free_list {
5867         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5868         int                                     f_inode_type;
5869         int                                     f_slot;
5870         struct ocfs2_cached_block_free          *f_first;
5871 };
5872
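     /*
      * Free one slot's worth of cached block frees: lock the suballocator
      * inode identified by (sysfile_type, slot), then return each queued
      * bit with ocfs2_free_suballoc_bits(), extending the transaction
      * between items. The list is always consumed - items left over after
      * an error are still kfree'd on the way out.
      */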
5873 static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5874                                    int sysfile_type,
5875                                    int slot,
5876                                    struct ocfs2_cached_block_free *head)
5877 {
5878         int ret;
5879         u64 bg_blkno;
5880         handle_t *handle;
5881         struct inode *inode;
5882         struct buffer_head *di_bh = NULL;
5883         struct ocfs2_cached_block_free *tmp;
5884
5885         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5886         if (!inode) {
5887                 ret = -EINVAL;
5888                 mlog_errno(ret);
5889                 goto out;
5890         }
5891
5892         mutex_lock(&inode->i_mutex);
5893
5894         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5895         if (ret) {
5896                 mlog_errno(ret);
5897                 goto out_mutex;
5898         }
5899
5900         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5901         if (IS_ERR(handle)) {
5902                 ret = PTR_ERR(handle);
5903                 mlog_errno(ret);
5904                 goto out_unlock;
5905         }
5906
5907         while (head) {
5908                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5909                                                       head->free_bit);
5910                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5911                      head->free_bit, (unsigned long long)head->free_blk);
5912
5913                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5914                                                head->free_bit, bg_blkno, 1);
5915                 if (ret) {
5916                         mlog_errno(ret);
5917                         goto out_journal;
5918                 }
5919
5920                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5921                 if (ret) {
5922                         mlog_errno(ret);
5923                         goto out_journal;
5924                 }
5925
5926                 tmp = head;
5927                 head = head->free_next;
5928                 kfree(tmp);
5929         }
5930
5931 out_journal:
5932         ocfs2_commit_trans(osb, handle);
5933
5934 out_unlock:
5935         ocfs2_inode_unlock(inode, 1);
5936         brelse(di_bh);
5937 out_mutex:
5938         mutex_unlock(&inode->i_mutex);
5939         iput(inode);
5940 out:
5941         while(head) {
5942                 /* Premature exit may have left some dangling items. */
5943                 tmp = head;
5944                 head = head->free_next;
5945                 kfree(tmp);
5946         }
5947
5948         return ret;
5949 }
5950
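     /*
      * Run every per-slot free list in the context. A failure on one list
      * is recorded but doesn't stop us from processing the rest; the first
      * error seen is what gets returned.
      */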
5951 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5952                        struct ocfs2_cached_dealloc_ctxt *ctxt)
5953 {
5954         int ret = 0, ret2;
5955         struct ocfs2_per_slot_free_list *fl;
5956
5957         if (!ctxt)
5958                 return 0;
5959
5960         while (ctxt->c_first_suballocator) {
5961                 fl = ctxt->c_first_suballocator;
5962
5963                 if (fl->f_first) {
5964                         mlog(0, "Free items: (type %u, slot %d)\n",
5965                              fl->f_inode_type, fl->f_slot);
5966                         ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5967                                                        fl->f_slot, fl->f_first);
5968                         if (ret2)
5969                                 mlog_errno(ret2);
5970                         if (!ret)
5971                                 ret = ret2;
5972                 }
5973
5974                 ctxt->c_first_suballocator = fl->f_next_suballocator;
5975                 kfree(fl);
5976         }
5977
5978         return ret;
5979 }
5980
5981 static struct ocfs2_per_slot_free_list *
5982 ocfs2_find_per_slot_free_list(int type,
5983                               int slot,
5984                               struct ocfs2_cached_dealloc_ctxt *ctxt)
5985 {
5986         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5987
5988         while (fl) {
5989                 if (fl->f_inode_type == type && fl->f_slot == slot)
5990                         return fl;
5991
5992                 fl = fl->f_next_suballocator;
5993         }
5994
5995         fl = kmalloc(sizeof(*fl), GFP_NOFS);
5996         if (fl) {
5997                 fl->f_inode_type = type;
5998                 fl->f_slot = slot;
5999                 fl->f_first = NULL;
6000                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6001
6002                 ctxt->c_first_suballocator = fl;
6003         }
6004         return fl;
6005 }
6006
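     /*
      * Queue a single (blkno, bit) pair for later freeing, creating the
      * per-slot free list for (type, slot) if this is its first entry.
      */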
6007 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6008                                      int type, int slot, u64 blkno,
6009                                      unsigned int bit)
6010 {
6011         int ret;
6012         struct ocfs2_per_slot_free_list *fl;
6013         struct ocfs2_cached_block_free *item;
6014
6015         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6016         if (fl == NULL) {
6017                 ret = -ENOMEM;
6018                 mlog_errno(ret);
6019                 goto out;
6020         }
6021
6022         item = kmalloc(sizeof(*item), GFP_NOFS);
6023         if (item == NULL) {
6024                 ret = -ENOMEM;
6025                 mlog_errno(ret);
6026                 goto out;
6027         }
6028
6029         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6030              type, slot, bit, (unsigned long long)blkno);
6031
6032         item->free_blk = blkno;
6033         item->free_bit = bit;
6034         item->free_next = fl->f_first;
6035
6036         fl->f_first = item;
6037
6038         ret = 0;
6039 out:
6040         return ret;
6041 }
6042
6043 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6044                                          struct ocfs2_extent_block *eb)
6045 {
6046         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6047                                          le16_to_cpu(eb->h_suballoc_slot),
6048                                          le64_to_cpu(eb->h_blkno),
6049                                          le16_to_cpu(eb->h_suballoc_bit));
6050 }
6051
6052 /* This function will figure out whether the currently last extent
6053  * block will be deleted, and if it will, what the new last extent
6054  * block will be so we can update its h_next_leaf_blk field, as well
6055  * as the dinode's i_last_eb_blk */
6056 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6057                                        unsigned int clusters_to_del,
6058                                        struct ocfs2_path *path,
6059                                        struct buffer_head **new_last_eb)
6060 {
6061         int next_free, ret = 0;
6062         u32 cpos;
6063         struct ocfs2_extent_rec *rec;
6064         struct ocfs2_extent_block *eb;
6065         struct ocfs2_extent_list *el;
6066         struct buffer_head *bh = NULL;
6067
6068         *new_last_eb = NULL;
6069
6070         /* we have no tree, so of course, no last_eb. */
6071         if (!path->p_tree_depth)
6072                 goto out;
6073
6074         /* trunc to zero special case - this makes tree_depth = 0
6075          * regardless of what it is.  */
6076         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6077                 goto out;
6078
6079         el = path_leaf_el(path);
6080         BUG_ON(!el->l_next_free_rec);
6081
6082         /*
6083          * Make sure that this extent list will actually be empty
6084          * after we clear away the data. We can shortcut out if
6085          * there's more than one non-empty extent in the
6086          * list. Otherwise, a check of the remaining extent is
6087          * necessary.
6088          */
6089         next_free = le16_to_cpu(el->l_next_free_rec);
6090         rec = NULL;
6091         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6092                 if (next_free > 2)
6093                         goto out;
6094
6095                 /* We may have a valid extent in index 1, check it. */
6096                 if (next_free == 2)
6097                         rec = &el->l_recs[1];
6098
6099                 /*
6100                  * Fall through - no more nonempty extents, so we want
6101                  * to delete this leaf.
6102                  */
6103         } else {
6104                 if (next_free > 1)
6105                         goto out;
6106
6107                 rec = &el->l_recs[0];
6108         }
6109
6110         if (rec) {
6111                 /*
6112                  * Check if we'll only be trimming off the end of this
6113                  * extent.
6114                  */
6115                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6116                         goto out;
6117         }
6118
6119         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6120         if (ret) {
6121                 mlog_errno(ret);
6122                 goto out;
6123         }
6124
6125         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6126         if (ret) {
6127                 mlog_errno(ret);
6128                 goto out;
6129         }
6130
6131         eb = (struct ocfs2_extent_block *) bh->b_data;
6132         el = &eb->h_list;
6133         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6134                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6135                 ret = -EROFS;
6136                 goto out;
6137         }
6138
6139         *new_last_eb = bh;
6140         get_bh(*new_last_eb);
6141         mlog(0, "returning block %llu, (cpos: %u)\n",
6142              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6143 out:
6144         brelse(bh);
6145
6146         return ret;
6147 }
6148
6149 /*
6150  * Trim some clusters off the rightmost edge of a tree. Only called
6151  * during truncate.
6152  *
6153  * The caller needs to:
6154  *   - start journaling of each path component.
6155  *   - compute and fully set up any new last ext block
6156  */
6157 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6158                            handle_t *handle, struct ocfs2_truncate_context *tc,
6159                            u32 clusters_to_del, u64 *delete_start)
6160 {
6161         int ret, i, index = path->p_tree_depth;
6162         u32 new_edge = 0;
6163         u64 deleted_eb = 0;
6164         struct buffer_head *bh;
6165         struct ocfs2_extent_list *el;
6166         struct ocfs2_extent_rec *rec;
6167
6168         *delete_start = 0;
6169
6170         while (index >= 0) {
6171                 bh = path->p_node[index].bh;
6172                 el = path->p_node[index].el;
6173
6174                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6175                      index,  (unsigned long long)bh->b_blocknr);
6176
6177                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6178
6179                 if (index !=
6180                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6181                         ocfs2_error(inode->i_sb,
6182                                     "Inode %lu has invalid ext. block %llu",
6183                                     inode->i_ino,
6184                                     (unsigned long long)bh->b_blocknr);
6185                         ret = -EROFS;
6186                         goto out;
6187                 }
6188
6189 find_tail_record:
6190                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6191                 rec = &el->l_recs[i];
6192
6193                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6194                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6195                      ocfs2_rec_clusters(el, rec),
6196                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6197                      le16_to_cpu(el->l_next_free_rec));
6198
6199                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6200
6201                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6202                         /*
6203                          * If the leaf block contains only a single empty
6204                          * extent and no other records, we can just remove
6205                          * the block.
6206                          */
6207                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6208                                 memset(rec, 0,
6209                                        sizeof(struct ocfs2_extent_rec));
6210                                 el->l_next_free_rec = cpu_to_le16(0);
6211
6212                                 goto delete;
6213                         }
6214
6215                         /*
6216                          * Remove any empty extents by shifting things
6217                          * left. That should make life much easier on
6218                          * the code below. This condition is rare
6219                          * enough that we shouldn't see a performance
6220                          * hit.
6221                          */
6222                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6223                                 le16_add_cpu(&el->l_next_free_rec, -1);
6224
6225                                 for(i = 0;
6226                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6227                                         el->l_recs[i] = el->l_recs[i + 1];
6228
6229                                 memset(&el->l_recs[i], 0,
6230                                        sizeof(struct ocfs2_extent_rec));
6231
6232                                 /*
6233                                  * We've modified our extent list. The
6234                                  * simplest way to handle this change
6235                                  * is to begin the search from the
6236                                  * start again.
6237                                  */
6238                                 goto find_tail_record;
6239                         }
6240
6241                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6242
6243                         /*
6244                          * We'll use "new_edge" on our way back up the
6245                          * tree to know what our rightmost cpos is.
6246                          */
6247                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6248                         new_edge += le32_to_cpu(rec->e_cpos);
6249
6250                         /*
6251                          * The caller will use this to delete data blocks.
6252                          */
6253                         *delete_start = le64_to_cpu(rec->e_blkno)
6254                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6255                                         le16_to_cpu(rec->e_leaf_clusters));
6256
6257                         /*
6258                          * If it's now empty, remove this record.
6259                          */
6260                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6261                                 memset(rec, 0,
6262                                        sizeof(struct ocfs2_extent_rec));
6263                                 le16_add_cpu(&el->l_next_free_rec, -1);
6264                         }
6265                 } else {
6266                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6267                                 memset(rec, 0,
6268                                        sizeof(struct ocfs2_extent_rec));
6269                                 le16_add_cpu(&el->l_next_free_rec, -1);
6270
6271                                 goto delete;
6272                         }
6273
6274                         /* Can this actually happen? */
6275                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6276                                 goto delete;
6277
6278                         /*
6279                          * We never actually deleted any clusters
6280                          * because our leaf was empty. There's no
6281                          * reason to adjust the rightmost edge then.
6282                          */
6283                         if (new_edge == 0)
6284                                 goto delete;
6285
6286                         rec->e_int_clusters = cpu_to_le32(new_edge);
6287                         le32_add_cpu(&rec->e_int_clusters,
6288                                      -le32_to_cpu(rec->e_cpos));
6289
6290                          /*
6291                           * A deleted child record should have been
6292                           * caught above.
6293                           */
6294                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6295                 }
6296
6297 delete:
6298                 ret = ocfs2_journal_dirty(handle, bh);
6299                 if (ret) {
6300                         mlog_errno(ret);
6301                         goto out;
6302                 }
6303
6304                 mlog(0, "extent list container %llu, after: record %d: "
6305                      "(%u, %u, %llu), next = %u.\n",
6306                      (unsigned long long)bh->b_blocknr, i,
6307                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6308                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6309                      le16_to_cpu(el->l_next_free_rec));
6310
6311                 /*
6312                  * We must be careful to only attempt delete of an
6313                  * extent block (and not the root inode block).
6314                  */
6315                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6316                         struct ocfs2_extent_block *eb =
6317                                 (struct ocfs2_extent_block *)bh->b_data;
6318
6319                         /*
6320                          * Save this for use when processing the
6321                          * parent block.
6322                          */
6323                         deleted_eb = le64_to_cpu(eb->h_blkno);
6324
6325                         mlog(0, "deleting this extent block.\n");
6326
6327                         ocfs2_remove_from_cache(inode, bh);
6328
6329                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6330                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6331                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6332
6333                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6334                         /* An error here is not fatal. */
6335                         if (ret < 0)
6336                                 mlog_errno(ret);
6337                 } else {
6338                         deleted_eb = 0;
6339                 }
6340
6341                 index--;
6342         }
6343
6344         ret = 0;
6345 out:
6346         return ret;
6347 }
6348
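     /*
      * Remove clusters_to_del clusters from the rightmost edge of the tree
      * within one transaction: find what the new last extent block will be,
      * journal the whole path, update i_clusters, trim the tree and then
      * log the freed range in the truncate log so the clusters can be
      * returned to the bitmap later.
      */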
6349 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6350                              unsigned int clusters_to_del,
6351                              struct inode *inode,
6352                              struct buffer_head *fe_bh,
6353                              handle_t *handle,
6354                              struct ocfs2_truncate_context *tc,
6355                              struct ocfs2_path *path)
6356 {
6357         int status;
6358         struct ocfs2_dinode *fe;
6359         struct ocfs2_extent_block *last_eb = NULL;
6360         struct ocfs2_extent_list *el;
6361         struct buffer_head *last_eb_bh = NULL;
6362         u64 delete_blk = 0;
6363
6364         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6365
6366         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6367                                              path, &last_eb_bh);
6368         if (status < 0) {
6369                 mlog_errno(status);
6370                 goto bail;
6371         }
6372
6373         /*
6374          * Each component will be touched, so we might as well journal
6375          * here to avoid having to handle errors later.
6376          */
6377         status = ocfs2_journal_access_path(inode, handle, path);
6378         if (status < 0) {
6379                 mlog_errno(status);
6380                 goto bail;
6381         }
6382
6383         if (last_eb_bh) {
6384                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6385                                               OCFS2_JOURNAL_ACCESS_WRITE);
6386                 if (status < 0) {
6387                         mlog_errno(status);
6388                         goto bail;
6389                 }
6390
6391                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6392         }
6393
6394         el = &(fe->id2.i_list);
6395
6396         /*
6397          * Lower levels depend on this never happening, but it's best
6398          * to check for it up here before changing the tree.
6399          */
6400         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6401                 ocfs2_error(inode->i_sb,
6402                             "Inode %lu has an empty extent record, depth %u\n",
6403                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6404                 status = -EROFS;
6405                 goto bail;
6406         }
6407
6408         spin_lock(&OCFS2_I(inode)->ip_lock);
6409         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6410                                       clusters_to_del;
6411         spin_unlock(&OCFS2_I(inode)->ip_lock);
6412         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6413         inode->i_blocks = ocfs2_inode_sector_count(inode);
6414
6415         status = ocfs2_trim_tree(inode, path, handle, tc,
6416                                  clusters_to_del, &delete_blk);
6417         if (status) {
6418                 mlog_errno(status);
6419                 goto bail;
6420         }
6421
6422         if (le32_to_cpu(fe->i_clusters) == 0) {
6423                 /* trunc to zero is a special case. */
6424                 el->l_tree_depth = 0;
6425                 fe->i_last_eb_blk = 0;
6426         } else if (last_eb)
6427                 fe->i_last_eb_blk = last_eb->h_blkno;
6428
6429         status = ocfs2_journal_dirty(handle, fe_bh);
6430         if (status < 0) {
6431                 mlog_errno(status);
6432                 goto bail;
6433         }
6434
6435         if (last_eb) {
6436                 /* If there will be a new last extent block, then by
6437                  * definition, there cannot be any leaves to the right of
6438                  * it. */
6439                 last_eb->h_next_leaf_blk = 0;
6440                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6441                 if (status < 0) {
6442                         mlog_errno(status);
6443                         goto bail;
6444                 }
6445         }
6446
6447         if (delete_blk) {
6448                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6449                                                    clusters_to_del);
6450                 if (status < 0) {
6451                         mlog_errno(status);
6452                         goto bail;
6453                 }
6454         }
6455         status = 0;
6456 bail:
6457
6458         mlog_exit(status);
6459         return status;
6460 }
6461
6462 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
6463 {
6464         set_buffer_uptodate(bh);
6465         mark_buffer_dirty(bh);
6466         return 0;
6467 }
6468
6469 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6470 {
6471         set_buffer_uptodate(bh);
6472         mark_buffer_dirty(bh);
6473         return ocfs2_journal_dirty_data(handle, bh);
6474 }
6475
6476 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6477                                      unsigned int from, unsigned int to,
6478                                      struct page *page, int zero, u64 *phys)
6479 {
6480         int ret, partial = 0;
6481
6482         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6483         if (ret)
6484                 mlog_errno(ret);
6485
6486         if (zero)
6487                 zero_user_segment(page, from, to);
6488
6489         /*
6490          * Need to set the buffers we zero'd into uptodate
6491          * here if they aren't - ocfs2_map_page_blocks()
6492          * might've skipped some
6493          */
6494         if (ocfs2_should_order_data(inode)) {
6495                 ret = walk_page_buffers(handle,
6496                                         page_buffers(page),
6497                                         from, to, &partial,
6498                                         ocfs2_ordered_zero_func);
6499                 if (ret < 0)
6500                         mlog_errno(ret);
6501         } else {
6502                 ret = walk_page_buffers(handle, page_buffers(page),
6503                                         from, to, &partial,
6504                                         ocfs2_writeback_zero_func);
6505                 if (ret < 0)
6506                         mlog_errno(ret);
6507         }
6508
6509         if (!partial)
6510                 SetPageUptodate(page);
6511
6512         flush_dcache_page(page);
6513 }
6514
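     /*
      * Zero the byte range from start up to end across the already-locked
      * pages, mapping buffers and marking them dirty as we go, then unlock
      * and release the pages. Only called on sparse file systems.
      */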
6515 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6516                                      loff_t end, struct page **pages,
6517                                      int numpages, u64 phys, handle_t *handle)
6518 {
6519         int i;
6520         struct page *page;
6521         unsigned int from, to = PAGE_CACHE_SIZE;
6522         struct super_block *sb = inode->i_sb;
6523
6524         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6525
6526         if (numpages == 0)
6527                 goto out;
6528
6529         to = PAGE_CACHE_SIZE;
6530         for(i = 0; i < numpages; i++) {
6531                 page = pages[i];
6532
6533                 from = start & (PAGE_CACHE_SIZE - 1);
6534                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6535                         to = end & (PAGE_CACHE_SIZE - 1);
6536
6537                 BUG_ON(from > PAGE_CACHE_SIZE);
6538                 BUG_ON(to > PAGE_CACHE_SIZE);
6539
6540                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6541                                          &phys);
6542
6543                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6544         }
6545 out:
6546         if (pages)
6547                 ocfs2_unlock_and_free_pages(pages, numpages);
6548 }
6549
6550 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6551                                 struct page **pages, int *num)
6552 {
6553         int numpages, ret = 0;
6554         struct super_block *sb = inode->i_sb;
6555         struct address_space *mapping = inode->i_mapping;
6556         unsigned long index;
6557         loff_t last_page_bytes;
6558
6559         BUG_ON(start > end);
6560
6561         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6562                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6563
6564         numpages = 0;
6565         last_page_bytes = PAGE_ALIGN(end);
6566         index = start >> PAGE_CACHE_SHIFT;
6567         do {
6568                 pages[numpages] = grab_cache_page(mapping, index);
6569                 if (!pages[numpages]) {
6570                         ret = -ENOMEM;
6571                         mlog_errno(ret);
6572                         goto out;
6573                 }
6574
6575                 numpages++;
6576                 index++;
6577         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6578
6579 out:
6580         if (ret != 0) {
6581                 if (pages)
6582                         ocfs2_unlock_and_free_pages(pages, numpages);
6583                 numpages = 0;
6584         }
6585
6586         *num = numpages;
6587
6588         return ret;
6589 }
6590
6591 /*
6592  * Zero the area past i_size but still within an allocated
6593  * cluster. This avoids exposing nonzero data on subsequent file
6594  * extends.
6595  *
6596  * We need to call this before i_size is updated on the inode because
6597  * otherwise block_write_full_page() will skip writeout of pages past
6598  * i_size. The range_start/range_end parameters are passed for this reason.
6599  */
6600 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6601                                   u64 range_start, u64 range_end)
6602 {
6603         int ret = 0, numpages;
6604         struct page **pages = NULL;
6605         u64 phys;
6606         unsigned int ext_flags;
6607         struct super_block *sb = inode->i_sb;
6608
6609         /*
6610          * File systems which don't support sparse files zero on every
6611          * extend.
6612          */
6613         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6614                 return 0;
6615
6616         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6617                         sizeof(struct page *), GFP_NOFS);
6618         if (pages == NULL) {
6619                 ret = -ENOMEM;
6620                 mlog_errno(ret);
6621                 goto out;
6622         }
6623
6624         if (range_start == range_end)
6625                 goto out;
6626
6627         ret = ocfs2_extent_map_get_blocks(inode,
6628                                           range_start >> sb->s_blocksize_bits,
6629                                           &phys, NULL, &ext_flags);
6630         if (ret) {
6631                 mlog_errno(ret);
6632                 goto out;
6633         }
6634
6635         /*
6636          * Tail is a hole, or is marked unwritten. In either case, we
6637          * can count on read and write to return/push zeros.
6638          */
6639         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6640                 goto out;
6641
6642         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6643                                    &numpages);
6644         if (ret) {
6645                 mlog_errno(ret);
6646                 goto out;
6647         }
6648
6649         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6650                                  numpages, phys, handle);
6651
6652         /*
6653          * Initiate writeout of the pages we zero'd here. We don't
6654          * wait on them - the truncate_inode_pages() call later will
6655          * do that for us.
6656          */
6657         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6658                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6659         if (ret)
6660                 mlog_errno(ret);
6661
6662 out:
6663         if (pages)
6664                 kfree(pages);
6665
6666         return ret;
6667 }
6668
6669 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6670                                              struct ocfs2_dinode *di)
6671 {
6672         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6673         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6674
6675         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6676                 memset(&di->id2, 0, blocksize -
6677                                     offsetof(struct ocfs2_dinode, id2) -
6678                                     xattrsize);
6679         else
6680                 memset(&di->id2, 0, blocksize -
6681                                     offsetof(struct ocfs2_dinode, id2));
6682 }
6683
6684 void ocfs2_dinode_new_extent_list(struct inode *inode,
6685                                   struct ocfs2_dinode *di)
6686 {
6687         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6688         di->id2.i_list.l_tree_depth = 0;
6689         di->id2.i_list.l_next_free_rec = 0;
6690         di->id2.i_list.l_count = cpu_to_le16(
6691                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6692 }
6693
6694 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6695 {
6696         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6697         struct ocfs2_inline_data *idata = &di->id2.i_data;
6698
6699         spin_lock(&oi->ip_lock);
6700         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6701         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6702         spin_unlock(&oi->ip_lock);
6703
6704         /*
6705          * We clear the entire i_data structure here so that all
6706          * fields can be properly initialized.
6707          */
6708         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6709
6710         idata->id_count = cpu_to_le16(
6711                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6712 }
6713
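     /*
      * Turn an inline-data inode into one with an extent list. If the inode
      * holds data we claim a single cluster, copy the inline bytes into the
      * page cache for that cluster and dirty the pages, then clear
      * OCFS2_INLINE_DATA_FL, reset the dinode's extent list and insert the
      * new cluster as the first extent.
      */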
6714 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6715                                          struct buffer_head *di_bh)
6716 {
6717         int ret, i, has_data, num_pages = 0;
6718         handle_t *handle;
6719         u64 uninitialized_var(block);
6720         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6721         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6722         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6723         struct ocfs2_alloc_context *data_ac = NULL;
6724         struct page **pages = NULL;
6725         loff_t end = osb->s_clustersize;
6726
6727         has_data = i_size_read(inode) ? 1 : 0;
6728
6729         if (has_data) {
6730                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6731                                 sizeof(struct page *), GFP_NOFS);
6732                 if (pages == NULL) {
6733                         ret = -ENOMEM;
6734                         mlog_errno(ret);
6735                         goto out;
6736                 }
6737
6738                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6739                 if (ret) {
6740                         mlog_errno(ret);
6741                         goto out;
6742                 }
6743         }
6744
6745         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6746         if (IS_ERR(handle)) {
6747                 ret = PTR_ERR(handle);
6748                 mlog_errno(ret);
6749                 goto out_unlock;
6750         }
6751
6752         ret = ocfs2_journal_access(handle, inode, di_bh,
6753                                    OCFS2_JOURNAL_ACCESS_WRITE);
6754         if (ret) {
6755                 mlog_errno(ret);
6756                 goto out_commit;
6757         }
6758
6759         if (has_data) {
6760                 u32 bit_off, num;
6761                 unsigned int page_end;
6762                 u64 phys;
6763
6764                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6765                                            &num);
6766                 if (ret) {
6767                         mlog_errno(ret);
6768                         goto out_commit;
6769                 }
6770
6771                 /*
6772                  * Save two copies, one for insert, and one that can
6773                  * be changed by ocfs2_map_and_dirty_page() below.
6774                  */
6775                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6776
6777                 /*
6778                  * Non sparse file systems zero on extend, so no need
6779                  * to do that now.
6780                  */
6781                 if (!ocfs2_sparse_alloc(osb) &&
6782                     PAGE_CACHE_SIZE < osb->s_clustersize)
6783                         end = PAGE_CACHE_SIZE;
6784
6785                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6786                 if (ret) {
6787                         mlog_errno(ret);
6788                         goto out_commit;
6789                 }
6790
6791                 /*
6792                  * This should populate the 1st page for us and mark
6793                  * it up to date.
6794                  */
6795                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6796                 if (ret) {
6797                         mlog_errno(ret);
6798                         goto out_commit;
6799                 }
6800
6801                 page_end = PAGE_CACHE_SIZE;
6802                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6803                         page_end = osb->s_clustersize;
6804
6805                 for (i = 0; i < num_pages; i++)
6806                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6807                                                  pages[i], i > 0, &phys);
6808         }
6809
6810         spin_lock(&oi->ip_lock);
6811         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6812         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6813         spin_unlock(&oi->ip_lock);
6814
6815         ocfs2_dinode_new_extent_list(inode, di);
6816
6817         ocfs2_journal_dirty(handle, di_bh);
6818
6819         if (has_data) {
6820                 /*
6821                  * An error at this point should be extremely rare. If
6822                  * this proves to be false, we could always re-build
6823                  * the in-inode data from our pages.
6824                  */
6825                 ret = ocfs2_dinode_insert_extent(osb, handle, inode, di_bh,
6826                                                  0, block, 1, 0, NULL);
6827                 if (ret) {
6828                         mlog_errno(ret);
6829                         goto out_commit;
6830                 }
6831
6832                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6833         }
6834
6835 out_commit:
6836         ocfs2_commit_trans(osb, handle);
6837
6838 out_unlock:
6839         if (data_ac)
6840                 ocfs2_free_alloc_context(data_ac);
6841
6842 out:
6843         if (pages) {
6844                 ocfs2_unlock_and_free_pages(pages, num_pages);
6845                 kfree(pages);
6846         }
6847
6848         return ret;
6849 }
6850
6851 /*
6852  * It is expected that, by the time you call this function,
6853  * inode->i_size and fe->i_size have been adjusted.
6854  *
6855  * WARNING: This will kfree the truncate context
6856  */
6857 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6858                           struct inode *inode,
6859                           struct buffer_head *fe_bh,
6860                           struct ocfs2_truncate_context *tc)
6861 {
6862         int status, i, credits, tl_sem = 0;
6863         u32 clusters_to_del, new_highest_cpos, range;
6864         struct ocfs2_extent_list *el;
6865         handle_t *handle = NULL;
6866         struct inode *tl_inode = osb->osb_tl_inode;
6867         struct ocfs2_path *path = NULL;
6868         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6869
6870         mlog_entry_void();
6871
6872         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6873                                                      i_size_read(inode));
6874
6875         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6876         if (!path) {
6877                 status = -ENOMEM;
6878                 mlog_errno(status);
6879                 goto bail;
6880         }
6881
6882         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6883
6884 start:
6885         /*
6886          * Check that we still have allocation to delete.
6887          */
6888         if (OCFS2_I(inode)->ip_clusters == 0) {
6889                 status = 0;
6890                 goto bail;
6891         }
6892
6893         /*
6894          * Truncate always works against the rightmost tree branch.
6895          */
6896         status = ocfs2_find_path(inode, path, UINT_MAX);
6897         if (status) {
6898                 mlog_errno(status);
6899                 goto bail;
6900         }
6901
6902         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6903              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6904
6905         /*
6906          * By now, el will point to the extent list on the bottommost
6907          * portion of this tree. Only the tail record is considered in
6908          * each pass.
6909          *
6910          * We handle the following cases, in order:
6911          * - empty extent: delete the remaining branch
6912          * - remove the entire record
6913          * - remove a partial record
6914          * - no record needs to be removed (truncate has completed)
6915          */
6916         el = path_leaf_el(path);
6917         if (le16_to_cpu(el->l_next_free_rec) == 0) {
6918                 ocfs2_error(inode->i_sb,
6919                             "Inode %llu has empty extent block at %llu\n",
6920                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6921                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
6922                 status = -EROFS;
6923                 goto bail;
6924         }
6925
6926         i = le16_to_cpu(el->l_next_free_rec) - 1;
6927         range = le32_to_cpu(el->l_recs[i].e_cpos) +
6928                 ocfs2_rec_clusters(el, &el->l_recs[i]);
6929         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6930                 clusters_to_del = 0;
6931         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6932                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6933         } else if (range > new_highest_cpos) {
6934                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6935                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
6936                                   new_highest_cpos;
6937         } else {
6938                 status = 0;
6939                 goto bail;
6940         }
6941
6942         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6943              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6944
6945         mutex_lock(&tl_inode->i_mutex);
6946         tl_sem = 1;
6947         /* Make sure there is at least one free truncate log record
6948          * before this pass: if ocfs2_truncate_log_needs_flush() says
6949          * the log is full, flush it out to empty it.  */
6950         if (ocfs2_truncate_log_needs_flush(osb)) {
6951                 status = __ocfs2_flush_truncate_log(osb);
6952                 if (status < 0) {
6953                         mlog_errno(status);
6954                         goto bail;
6955                 }
6956         }
6957
6958         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6959                                                 (struct ocfs2_dinode *)fe_bh->b_data,
6960                                                 el);
6961         handle = ocfs2_start_trans(osb, credits);
6962         if (IS_ERR(handle)) {
6963                 status = PTR_ERR(handle);
6964                 handle = NULL;
6965                 mlog_errno(status);
6966                 goto bail;
6967         }
6968
6969         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
6970                                    tc, path);
6971         if (status < 0) {
6972                 mlog_errno(status);
6973                 goto bail;
6974         }
6975
6976         mutex_unlock(&tl_inode->i_mutex);
6977         tl_sem = 0;
6978
6979         ocfs2_commit_trans(osb, handle);
6980         handle = NULL;
6981
6982         ocfs2_reinit_path(path, 1);
6983
6984         /*
6985          * Loop back - the ip_clusters check at 'start:' catches the
6986          * case where we've truncated away all allocation.
6987          */
6988         goto start;
6989
6990 bail:
6991
6992         ocfs2_schedule_truncate_log_flush(osb, 1);
6993
6994         if (tl_sem)
6995                 mutex_unlock(&tl_inode->i_mutex);
6996
6997         if (handle)
6998                 ocfs2_commit_trans(osb, handle);
6999
7000         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7001
7002         ocfs2_free_path(path);
7003
7004         /* Releases the last extent block reference and frees tc */
7005         ocfs2_free_truncate_context(tc);
7006
7007         mlog_exit(status);
7008         return status;
7009 }
7010
7011 /*
7012  * Set up a truncate context; expects the inode to already be locked.
7013  */
7014 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7015                            struct inode *inode,
7016                            struct buffer_head *fe_bh,
7017                            struct ocfs2_truncate_context **tc)
7018 {
7019         int status;
7020         unsigned int new_i_clusters;
7021         struct ocfs2_dinode *fe;
7022         struct ocfs2_extent_block *eb;
7023         struct buffer_head *last_eb_bh = NULL;
7024
7025         mlog_entry_void();
7026
7027         *tc = NULL;
7028
7029         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7030                                                   i_size_read(inode));
7031         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7032
7033         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
7034              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7035              (unsigned long long)le64_to_cpu(fe->i_size));
7036
7037         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7038         if (!(*tc)) {
7039                 status = -ENOMEM;
7040                 mlog_errno(status);
7041                 goto bail;
7042         }
7043         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7044
7045         if (fe->id2.i_list.l_tree_depth) {
7046                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
7047                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
7048                 if (status < 0) {
7049                         mlog_errno(status);
7050                         goto bail;
7051                 }
7052                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7053                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
7054                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
7055
7056                         brelse(last_eb_bh);
7057                         status = -EIO;
7058                         goto bail;
7059                 }
7060         }
7061
7062         (*tc)->tc_last_eb_bh = last_eb_bh;
7063
7064         status = 0;
7065 bail:
7066         if (status < 0) {
7067                 if (*tc)
7068                         ocfs2_free_truncate_context(*tc);
7069                 *tc = NULL;
7070         }
7071         mlog_exit_void();
7072         return status;
7073 }
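     /*
      * Note: the context set up above is meant to be handed to the
      * truncate commit path earlier in this file, whose exit path runs
      * the queued deallocations and then calls
      * ocfs2_free_truncate_context().
      */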
7074
7075 /*
7076  * 'start' is inclusive, 'end' is not.
7077  */
7078 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7079                           unsigned int start, unsigned int end, int trunc)
7080 {
7081         int ret;
7082         unsigned int numbytes;
7083         handle_t *handle;
7084         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7085         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7086         struct ocfs2_inline_data *idata = &di->id2.i_data;
7087
7088         if (end > i_size_read(inode))
7089                 end = i_size_read(inode);
7090
7091         BUG_ON(start >= end);
7092
7093         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7094             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7095             !ocfs2_supports_inline_data(osb)) {
7096                 ocfs2_error(inode->i_sb,
7097                             "Inline data flags for inode %llu don't agree! "
7098                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7099                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7100                             le16_to_cpu(di->i_dyn_features),
7101                             OCFS2_I(inode)->ip_dyn_features,
7102                             osb->s_feature_incompat);
7103                 ret = -EROFS;
7104                 goto out;
7105         }
7106
7107         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7108         if (IS_ERR(handle)) {
7109                 ret = PTR_ERR(handle);
7110                 mlog_errno(ret);
7111                 goto out;
7112         }
7113
7114         ret = ocfs2_journal_access(handle, inode, di_bh,
7115                                    OCFS2_JOURNAL_ACCESS_WRITE);
7116         if (ret) {
7117                 mlog_errno(ret);
7118                 goto out_commit;
7119         }
7120
7121         numbytes = end - start;
7122         memset(idata->id_data + start, 0, numbytes);
7123
7124         /*
7125          * No need to worry about the data page here - it's been
7126          * truncated already and inline data doesn't need it for
7127          * pushing zeros to disk, so we'll let readpage pick it up
7128          * later.
7129          */
7130         if (trunc) {
7131                 i_size_write(inode, start);
7132                 di->i_size = cpu_to_le64(start);
7133         }
7134
7135         inode->i_blocks = ocfs2_inode_sector_count(inode);
7136         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7137
7138         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7139         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7140
7141         ocfs2_journal_dirty(handle, di_bh);
7142
7143 out_commit:
7144         ocfs2_commit_trans(osb, handle);
7145
7146 out:
7147         return ret;
7148 }
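     /*
      * Illustrative calls (hypothetical byte offsets): zero bytes
      * [100, 200) of an inline-data inode without changing its size:
      *
      *      ret = ocfs2_truncate_inline(inode, di_bh, 100, 200, 0);
      *
      * or zero everything from byte 100 onward and shrink i_size to 100:
      *
      *      ret = ocfs2_truncate_inline(inode, di_bh, 100,
      *                                  i_size_read(inode), 1);
      */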
7149
7150 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7151 {
7152         /*
7153          * The caller is responsible for completing deallocation
7154          * before freeing the context.
7155          */
7156         if (tc->tc_dealloc.c_first_suballocator != NULL)
7157                 mlog(ML_NOTICE,
7158                      "Truncate completion has non-empty dealloc context\n");
7159
7160         /* brelse() is NULL-safe, so no need to check first */
7161         brelse(tc->tc_last_eb_bh);
7162
7163         kfree(tc);
7164 }