ocfs2: Make 'private' into 'object' on ocfs2_extent_tree.
[safe/jmp/linux-2.6] fs/ocfs2/alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52 /*
53  * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
54  * the b-tree operations in ocfs2. The b-tree operations are no longer
55  * limited to ocfs2_dinode: any metadata which needs to allocate clusters
56  * for its storage can use a b-tree, provided it implements its own
57  * ocfs2_extent_tree and operations.
58  *
59  * ocfs2_extent_tree holds info for the root of the b-tree; it must provide a
60  * root ocfs2_extent_list and a root_bh so that they can be used by the b-tree
61  * functions.
62  * ocfs2_extent_tree_operations abstracts the normal operations we perform on
63  * the root of an extent b-tree.
64  */
65 struct ocfs2_extent_tree;
66
67 struct ocfs2_extent_tree_operations {
68         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
69                                    u64 blkno);
70         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
71         void (*eo_update_clusters)(struct inode *inode,
72                                    struct ocfs2_extent_tree *et,
73                                    u32 new_clusters);
74         int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
75 };
76
77 struct ocfs2_extent_tree {
78         enum ocfs2_extent_tree_type             et_type;
79         struct ocfs2_extent_tree_operations     *et_ops;
80         struct buffer_head                      *et_root_bh;
81         struct ocfs2_extent_list                *et_root_el;
82         void                                    *et_object;
83         unsigned int                            et_max_leaf_clusters;
84 };
85
86 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
87                                          u64 blkno)
88 {
89         struct ocfs2_dinode *di = et->et_object;
90
91         BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
92         di->i_last_eb_blk = cpu_to_le64(blkno);
93 }
94
95 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
96 {
97         struct ocfs2_dinode *di = et->et_object;
98
99         BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
100         return le64_to_cpu(di->i_last_eb_blk);
101 }
102
103 static void ocfs2_dinode_update_clusters(struct inode *inode,
104                                          struct ocfs2_extent_tree *et,
105                                          u32 clusters)
106 {
107         struct ocfs2_dinode *di = et->et_object;
108
109         le32_add_cpu(&di->i_clusters, clusters);
110         spin_lock(&OCFS2_I(inode)->ip_lock);
111         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
112         spin_unlock(&OCFS2_I(inode)->ip_lock);
113 }
114
115 static int ocfs2_dinode_sanity_check(struct inode *inode,
116                                      struct ocfs2_extent_tree *et)
117 {
118         int ret = 0;
119         struct ocfs2_dinode *di;
120
121         BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
122
123         di = et->et_object;
124         if (!OCFS2_IS_VALID_DINODE(di)) {
125                 ret = -EIO;
126                 ocfs2_error(inode->i_sb,
127                         "Inode %llu has invalid path root",
128                         (unsigned long long)OCFS2_I(inode)->ip_blkno);
129         }
130
131         return ret;
132 }
133
134 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
135         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
136         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
137         .eo_update_clusters     = ocfs2_dinode_update_clusters,
138         .eo_sanity_check        = ocfs2_dinode_sanity_check,
139 };
140
141 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
142                                               u64 blkno)
143 {
144         struct ocfs2_xattr_value_root *xv =
145                 (struct ocfs2_xattr_value_root *)et->et_object;
146
147         xv->xr_last_eb_blk = cpu_to_le64(blkno);
148 }
149
150 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
151 {
152         struct ocfs2_xattr_value_root *xv =
153                 (struct ocfs2_xattr_value_root *) et->et_object;
154
155         return le64_to_cpu(xv->xr_last_eb_blk);
156 }
157
158 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
159                                               struct ocfs2_extent_tree *et,
160                                               u32 clusters)
161 {
162         struct ocfs2_xattr_value_root *xv =
163                 (struct ocfs2_xattr_value_root *)et->et_object;
164
165         le32_add_cpu(&xv->xr_clusters, clusters);
166 }
167
168 static int ocfs2_xattr_value_sanity_check(struct inode *inode,
169                                           struct ocfs2_extent_tree *et)
170 {
171         return 0;
172 }
173
174 static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
175         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
176         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
177         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
178         .eo_sanity_check        = ocfs2_xattr_value_sanity_check,
179 };
180
181 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
182                                              u64 blkno)
183 {
184         struct ocfs2_xattr_block *xb = et->et_object;
185         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
186
187         xt->xt_last_eb_blk = cpu_to_le64(blkno);
188 }
189
190 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
191 {
192         struct ocfs2_xattr_block *xb = et->et_object;
193         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
194
195         return le64_to_cpu(xt->xt_last_eb_blk);
196 }
197
198 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
199                                              struct ocfs2_extent_tree *et,
200                                              u32 clusters)
201 {
202         struct ocfs2_xattr_block *xb = et->et_object;
203
204         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
205 }
206
207 static int ocfs2_xattr_tree_sanity_check(struct inode *inode,
208                                          struct ocfs2_extent_tree *et)
209 {
210         return 0;
211 }
212
213 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
214         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
215         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
216         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
217         .eo_sanity_check        = ocfs2_xattr_tree_sanity_check,
218 };
219
220 static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
221                                   struct inode *inode,
222                                   struct buffer_head *bh,
223                                   enum ocfs2_extent_tree_type et_type,
224                                   void *obj)
225 {
226         et->et_type = et_type;
227         get_bh(bh);
228         et->et_root_bh = bh;
229         et->et_max_leaf_clusters = 0;
230         if (!obj)
231                 obj = (void *)bh->b_data;
232         et->et_object = obj;
233
234         if (et_type == OCFS2_DINODE_EXTENT) {
235                 et->et_root_el =
236                         &((struct ocfs2_dinode *)obj)->id2.i_list;
237                 et->et_ops = &ocfs2_dinode_et_ops;
238         } else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
239                 struct ocfs2_xattr_value_root *xv =
240                         (struct ocfs2_xattr_value_root *)obj;
241                 et->et_root_el = &xv->xr_list;
242                 et->et_ops = &ocfs2_xattr_et_ops;
243         } else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
244                 struct ocfs2_xattr_block *xb =
245                         (struct ocfs2_xattr_block *)obj;
246                 et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
247                 et->et_ops = &ocfs2_xattr_tree_et_ops;
248                 et->et_max_leaf_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
249                                                 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
250         }
251 }
252
253 static void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
254 {
255         brelse(et->et_root_bh);
256 }
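
For illustration, a minimal sketch (assumed caller context; `inode` and `di_bh` are hypothetical variables, not taken verbatim from this file) of how the two helpers above pair up for a dinode-backed tree:

struct ocfs2_extent_tree et;

/* A NULL object makes ocfs2_get_extent_tree() fall back to bh->b_data,
 * which for OCFS2_DINODE_EXTENT is the struct ocfs2_dinode itself. */
ocfs2_get_extent_tree(&et, inode, di_bh, OCFS2_DINODE_EXTENT, NULL);

/* ... operate on the tree via the ocfs2_et_*() wrappers below ... */

ocfs2_put_extent_tree(&et);	/* drops the reference taken on di_bh */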
257
258 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
259                                             u64 new_last_eb_blk)
260 {
261         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
262 }
263
264 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
265 {
266         return et->et_ops->eo_get_last_eb_blk(et);
267 }
268
269 static inline void ocfs2_et_update_clusters(struct inode *inode,
270                                             struct ocfs2_extent_tree *et,
271                                             u32 clusters)
272 {
273         et->et_ops->eo_update_clusters(inode, et, clusters);
274 }
275
276 static inline int ocfs2_et_sanity_check(struct inode *inode,
277                                         struct ocfs2_extent_tree *et)
278 {
279         return et->et_ops->eo_sanity_check(inode, et);
280 }
281
282 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
283 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
284                                          struct ocfs2_extent_block *eb);
285
286 /*
287  * Structures which describe a path through a btree, and functions to
288  * manipulate them.
289  *
290  * The idea here is to be as generic as possible with the tree
291  * manipulation code.
292  */
293 struct ocfs2_path_item {
294         struct buffer_head              *bh;
295         struct ocfs2_extent_list        *el;
296 };
297
298 #define OCFS2_MAX_PATH_DEPTH    5
299
300 struct ocfs2_path {
301         int                     p_tree_depth;
302         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
303 };
304
305 #define path_root_bh(_path) ((_path)->p_node[0].bh)
306 #define path_root_el(_path) ((_path)->p_node[0].el)
307 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
308 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
309 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
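
As a concrete illustration (hypothetical depth, not from this file), a path over a tree of depth 2 lays out as follows:

/* p_node[0]  root      (e.g. the dinode bh and &di->id2.i_list)
 * p_node[1]  interior  (extent block bh and &eb->h_list)
 * p_node[2]  leaf      (extent block bh and &eb->h_list)
 *
 * path_num_items(path) == 3, path_leaf_el(path) == path->p_node[2].el
 */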
310
311 /*
312  * Reset the actual path elements so that we can re-use the structure
313  * to build another path. Generally, this involves freeing the buffer
314  * heads.
315  */
316 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
317 {
318         int i, start = 0, depth = 0;
319         struct ocfs2_path_item *node;
320
321         if (keep_root)
322                 start = 1;
323
324         for(i = start; i < path_num_items(path); i++) {
325                 node = &path->p_node[i];
326
327                 brelse(node->bh);
328                 node->bh = NULL;
329                 node->el = NULL;
330         }
331
332         /*
333          * Tree depth may change during truncate, or insert. If we're
334          * keeping the root extent list, then make sure that our path
335          * structure reflects the proper depth.
336          */
337         if (keep_root)
338                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
339
340         path->p_tree_depth = depth;
341 }
342
343 static void ocfs2_free_path(struct ocfs2_path *path)
344 {
345         if (path) {
346                 ocfs2_reinit_path(path, 0);
347                 kfree(path);
348         }
349 }
350
351 /*
352  * Copy all the elements of src into dest. After this call, src could be freed
353  * without affecting dest.
354  *
355  * Both paths should have the same root. Any non-root elements of dest
356  * will be freed.
357  */
358 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
359 {
360         int i;
361
362         BUG_ON(path_root_bh(dest) != path_root_bh(src));
363         BUG_ON(path_root_el(dest) != path_root_el(src));
364
365         ocfs2_reinit_path(dest, 1);
366
367         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
368                 dest->p_node[i].bh = src->p_node[i].bh;
369                 dest->p_node[i].el = src->p_node[i].el;
370
371                 if (dest->p_node[i].bh)
372                         get_bh(dest->p_node[i].bh);
373         }
374 }
375
376 /*
377  * Make the *dest path the same as src and re-initialize src path to
378  * have a root only.
379  */
380 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
381 {
382         int i;
383
384         BUG_ON(path_root_bh(dest) != path_root_bh(src));
385
386         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
387                 brelse(dest->p_node[i].bh);
388
389                 dest->p_node[i].bh = src->p_node[i].bh;
390                 dest->p_node[i].el = src->p_node[i].el;
391
392                 src->p_node[i].bh = NULL;
393                 src->p_node[i].el = NULL;
394         }
395 }
396
397 /*
398  * Insert an extent block at given index.
399  *
400  * This will not take an additional reference on eb_bh.
401  */
402 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
403                                         struct buffer_head *eb_bh)
404 {
405         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
406
407         /*
408          * Right now, no root bh is an extent block, so this helps
409          * catch code errors with dinode trees. The assertion can be
410          * safely removed if we ever need to insert extent block
411          * structures at the root.
412          */
413         BUG_ON(index == 0);
414
415         path->p_node[index].bh = eb_bh;
416         path->p_node[index].el = &eb->h_list;
417 }
418
419 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
420                                          struct ocfs2_extent_list *root_el)
421 {
422         struct ocfs2_path *path;
423
424         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
425
426         path = kzalloc(sizeof(*path), GFP_NOFS);
427         if (path) {
428                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
429                 get_bh(root_bh);
430                 path_root_bh(path) = root_bh;
431                 path_root_el(path) = root_el;
432         }
433
434         return path;
435 }
436
437 /*
438  * Convenience function to journal all components in a path.
439  */
440 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
441                                      struct ocfs2_path *path)
442 {
443         int i, ret = 0;
444
445         if (!path)
446                 goto out;
447
448         for(i = 0; i < path_num_items(path); i++) {
449                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
450                                            OCFS2_JOURNAL_ACCESS_WRITE);
451                 if (ret < 0) {
452                         mlog_errno(ret);
453                         goto out;
454                 }
455         }
456
457 out:
458         return ret;
459 }
460
461 /*
462  * Return the index of the extent record which contains cluster #v_cluster.
463  * -1 is returned if it was not found.
464  *
465  * Should work fine on interior and exterior nodes.
466  */
467 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
468 {
469         int ret = -1;
470         int i;
471         struct ocfs2_extent_rec *rec;
472         u32 rec_end, rec_start, clusters;
473
474         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
475                 rec = &el->l_recs[i];
476
477                 rec_start = le32_to_cpu(rec->e_cpos);
478                 clusters = ocfs2_rec_clusters(el, rec);
479
480                 rec_end = rec_start + clusters;
481
482                 if (v_cluster >= rec_start && v_cluster < rec_end) {
483                         ret = i;
484                         break;
485                 }
486         }
487
488         return ret;
489 }
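
A worked example with made-up record values:

/* l_recs[0]: e_cpos = 0, clusters = 4    covers v_cluster 0..3
 * l_recs[1]: e_cpos = 4, clusters = 8    covers v_cluster 4..11
 *
 * ocfs2_search_extent_list(el, 6)  == 1
 * ocfs2_search_extent_list(el, 12) == -1   (past the last record)
 */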
490
491 enum ocfs2_contig_type {
492         CONTIG_NONE = 0,
493         CONTIG_LEFT,
494         CONTIG_RIGHT,
495         CONTIG_LEFTRIGHT,
496 };
497
498
499 /*
500  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
501  * ocfs2_extent_contig only work properly against leaf nodes!
502  */
503 static int ocfs2_block_extent_contig(struct super_block *sb,
504                                      struct ocfs2_extent_rec *ext,
505                                      u64 blkno)
506 {
507         u64 blk_end = le64_to_cpu(ext->e_blkno);
508
509         blk_end += ocfs2_clusters_to_blocks(sb,
510                                     le16_to_cpu(ext->e_leaf_clusters));
511
512         return blkno == blk_end;
513 }
514
515 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
516                                   struct ocfs2_extent_rec *right)
517 {
518         u32 left_range;
519
520         left_range = le32_to_cpu(left->e_cpos) +
521                 le16_to_cpu(left->e_leaf_clusters);
522
523         return (left_range == le32_to_cpu(right->e_cpos));
524 }
525
526 static enum ocfs2_contig_type
527         ocfs2_extent_contig(struct inode *inode,
528                             struct ocfs2_extent_rec *ext,
529                             struct ocfs2_extent_rec *insert_rec)
530 {
531         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
532
533         /*
534          * Refuse to coalesce extent records with different flag
535          * fields - we don't want to mix unwritten extents with user
536          * data.
537          */
538         if (ext->e_flags != insert_rec->e_flags)
539                 return CONTIG_NONE;
540
541         if (ocfs2_extents_adjacent(ext, insert_rec) &&
542             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
543                         return CONTIG_RIGHT;
544
545         blkno = le64_to_cpu(ext->e_blkno);
546         if (ocfs2_extents_adjacent(insert_rec, ext) &&
547             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
548                 return CONTIG_LEFT;
549
550         return CONTIG_NONE;
551 }
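
To illustrate with made-up numbers, assuming 2 blocks per cluster:

/* ext:        e_cpos = 0, e_leaf_clusters = 4, e_blkno = 100  (blocks 100..107)
 * insert_rec: e_cpos = 4, e_leaf_clusters = 4, e_blkno = 108
 *
 * ocfs2_extent_contig(inode, ext, insert_rec) == CONTIG_RIGHT
 * (matching e_flags, logically adjacent and physically contiguous)
 */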
552
553 /*
554  * NOTE: We can have pretty much any combination of contiguousness and
555  * appending.
556  *
557  * The usefulness of APPEND_TAIL is more in that it lets us know that
558  * we'll have to update the path to that leaf.
559  */
560 enum ocfs2_append_type {
561         APPEND_NONE = 0,
562         APPEND_TAIL,
563 };
564
565 enum ocfs2_split_type {
566         SPLIT_NONE = 0,
567         SPLIT_LEFT,
568         SPLIT_RIGHT,
569 };
570
571 struct ocfs2_insert_type {
572         enum ocfs2_split_type   ins_split;
573         enum ocfs2_append_type  ins_appending;
574         enum ocfs2_contig_type  ins_contig;
575         int                     ins_contig_index;
576         int                     ins_tree_depth;
577 };
578
579 struct ocfs2_merge_ctxt {
580         enum ocfs2_contig_type  c_contig_type;
581         int                     c_has_empty_extent;
582         int                     c_split_covers_rec;
583 };
584
585 /*
586  * How many free extents have we got before we need more meta data?
587  */
588 int ocfs2_num_free_extents(struct ocfs2_super *osb,
589                            struct inode *inode,
590                            struct buffer_head *root_bh,
591                            enum ocfs2_extent_tree_type type,
592                            void *obj)
593 {
594         int retval;
595         struct ocfs2_extent_list *el = NULL;
596         struct ocfs2_extent_block *eb;
597         struct buffer_head *eb_bh = NULL;
598         u64 last_eb_blk = 0;
599
600         mlog_entry_void();
601
602         if (type == OCFS2_DINODE_EXTENT) {
603                 struct ocfs2_dinode *fe =
604                                 (struct ocfs2_dinode *)root_bh->b_data;
605                 if (!OCFS2_IS_VALID_DINODE(fe)) {
606                         OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
607                         retval = -EIO;
608                         goto bail;
609                 }
610
611                 if (fe->i_last_eb_blk)
612                         last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
613                 el = &fe->id2.i_list;
614         } else if (type == OCFS2_XATTR_VALUE_EXTENT) {
615                 struct ocfs2_xattr_value_root *xv =
616                         (struct ocfs2_xattr_value_root *) obj;
617
618                 last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
619                 el = &xv->xr_list;
620         } else if (type == OCFS2_XATTR_TREE_EXTENT) {
621                 struct ocfs2_xattr_block *xb =
622                         (struct ocfs2_xattr_block *)root_bh->b_data;
623
624                 last_eb_blk = le64_to_cpu(xb->xb_attrs.xb_root.xt_last_eb_blk);
625                 el = &xb->xb_attrs.xb_root.xt_list;
626         }
627
628         if (last_eb_blk) {
629                 retval = ocfs2_read_block(osb, last_eb_blk,
630                                           &eb_bh, OCFS2_BH_CACHED, inode);
631                 if (retval < 0) {
632                         mlog_errno(retval);
633                         goto bail;
634                 }
635                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
636                 el = &eb->h_list;
637         }
638
639         BUG_ON(el->l_tree_depth != 0);
640
641         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
642 bail:
643         if (eb_bh)
644                 brelse(eb_bh);
645
646         mlog_exit(retval);
647         return retval;
648 }
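
For example (hypothetical counts): if the rightmost leaf ends up with l_count == 16 and l_next_free_rec == 10, the function returns 6, i.e. six more records fit in that leaf before new metadata must be allocated.

/* rightmost leaf: l_count = 16, l_next_free_rec = 10  =>  returns 6 */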
649
650 /* expects array to already be allocated
651  *
652  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
653  * l_count for you
654  */
655 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
656                                      handle_t *handle,
657                                      struct inode *inode,
658                                      int wanted,
659                                      struct ocfs2_alloc_context *meta_ac,
660                                      struct buffer_head *bhs[])
661 {
662         int count, status, i;
663         u16 suballoc_bit_start;
664         u32 num_got;
665         u64 first_blkno;
666         struct ocfs2_extent_block *eb;
667
668         mlog_entry_void();
669
670         count = 0;
671         while (count < wanted) {
672                 status = ocfs2_claim_metadata(osb,
673                                               handle,
674                                               meta_ac,
675                                               wanted - count,
676                                               &suballoc_bit_start,
677                                               &num_got,
678                                               &first_blkno);
679                 if (status < 0) {
680                         mlog_errno(status);
681                         goto bail;
682                 }
683
684                 for(i = count;  i < (num_got + count); i++) {
685                         bhs[i] = sb_getblk(osb->sb, first_blkno);
686                         if (bhs[i] == NULL) {
687                                 status = -EIO;
688                                 mlog_errno(status);
689                                 goto bail;
690                         }
691                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
692
693                         status = ocfs2_journal_access(handle, inode, bhs[i],
694                                                       OCFS2_JOURNAL_ACCESS_CREATE);
695                         if (status < 0) {
696                                 mlog_errno(status);
697                                 goto bail;
698                         }
699
700                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
701                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
702                         /* Ok, setup the minimal stuff here. */
703                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
704                         eb->h_blkno = cpu_to_le64(first_blkno);
705                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
706                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
707                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
708                         eb->h_list.l_count =
709                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
710
711                         suballoc_bit_start++;
712                         first_blkno++;
713
714                         /* We'll also be dirtied by the caller, so
715                          * this isn't absolutely necessary. */
716                         status = ocfs2_journal_dirty(handle, bhs[i]);
717                         if (status < 0) {
718                                 mlog_errno(status);
719                                 goto bail;
720                         }
721                 }
722
723                 count += num_got;
724         }
725
726         status = 0;
727 bail:
728         if (status < 0) {
729                 for(i = 0; i < wanted; i++) {
730                         if (bhs[i])
731                                 brelse(bhs[i]);
732                         bhs[i] = NULL;
733                 }
734         }
735         mlog_exit(status);
736         return status;
737 }
738
739 /*
740  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
741  *
742  * Returns the sum of the rightmost extent rec logical offset and
743  * cluster count.
744  *
745  * ocfs2_add_branch() uses this to determine what logical cluster
746  * value should be populated into the leftmost new branch records.
747  *
748  * ocfs2_shift_tree_depth() uses this to determine the # clusters
749  * value for the new topmost tree record.
750  */
751 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
752 {
753         int i;
754
755         i = le16_to_cpu(el->l_next_free_rec) - 1;
756
757         return le32_to_cpu(el->l_recs[i].e_cpos) +
758                 ocfs2_rec_clusters(el, &el->l_recs[i]);
759 }
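
For instance (made-up numbers): if the rightmost record starts at cpos 100 and covers 8 clusters, the helper returns 108, which ocfs2_add_branch() uses as e_cpos of the empty record in each new branch block and ocfs2_shift_tree_depth() uses as the cluster count of the new topmost record.

/* rightmost rec: e_cpos = 100, clusters = 8  =>  ocfs2_sum_rightmost_rec(el) == 108 */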
760
761 /*
762  * Add an entire tree branch to our inode. eb_bh is the extent block
763  * to start at, if we don't want to start the branch at the dinode
764  * structure.
765  *
766  * last_eb_bh is required as we have to update its next_leaf pointer
767  * for the new last extent block.
768  *
769  * The new branch will be 'empty' in the sense that every block will
770  * contain a single record with cluster count == 0.
771  */
772 static int ocfs2_add_branch(struct ocfs2_super *osb,
773                             handle_t *handle,
774                             struct inode *inode,
775                             struct ocfs2_extent_tree *et,
776                             struct buffer_head *eb_bh,
777                             struct buffer_head **last_eb_bh,
778                             struct ocfs2_alloc_context *meta_ac)
779 {
780         int status, new_blocks, i;
781         u64 next_blkno, new_last_eb_blk;
782         struct buffer_head *bh;
783         struct buffer_head **new_eb_bhs = NULL;
784         struct ocfs2_extent_block *eb;
785         struct ocfs2_extent_list  *eb_el;
786         struct ocfs2_extent_list  *el;
787         u32 new_cpos;
788
789         mlog_entry_void();
790
791         BUG_ON(!last_eb_bh || !*last_eb_bh);
792
793         if (eb_bh) {
794                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
795                 el = &eb->h_list;
796         } else
797                 el = et->et_root_el;
798
799         /* we never add a branch to a leaf. */
800         BUG_ON(!el->l_tree_depth);
801
802         new_blocks = le16_to_cpu(el->l_tree_depth);
803
804         /* allocate the number of new eb blocks we need */
805         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
806                              GFP_KERNEL);
807         if (!new_eb_bhs) {
808                 status = -ENOMEM;
809                 mlog_errno(status);
810                 goto bail;
811         }
812
813         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
814                                            meta_ac, new_eb_bhs);
815         if (status < 0) {
816                 mlog_errno(status);
817                 goto bail;
818         }
819
820         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
821         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
822
823         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
824          * linked with the rest of the tree.
825  * Conversely, new_eb_bhs[0] is the new bottommost leaf.
826          *
827          * when we leave the loop, new_last_eb_blk will point to the
828          * newest leaf, and next_blkno will point to the topmost extent
829          * block. */
830         next_blkno = new_last_eb_blk = 0;
831         for(i = 0; i < new_blocks; i++) {
832                 bh = new_eb_bhs[i];
833                 eb = (struct ocfs2_extent_block *) bh->b_data;
834                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
835                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
836                         status = -EIO;
837                         goto bail;
838                 }
839                 eb_el = &eb->h_list;
840
841                 status = ocfs2_journal_access(handle, inode, bh,
842                                               OCFS2_JOURNAL_ACCESS_CREATE);
843                 if (status < 0) {
844                         mlog_errno(status);
845                         goto bail;
846                 }
847
848                 eb->h_next_leaf_blk = 0;
849                 eb_el->l_tree_depth = cpu_to_le16(i);
850                 eb_el->l_next_free_rec = cpu_to_le16(1);
851                 /*
852                  * This actually counts as an empty extent as
853                  * c_clusters == 0
854                  */
855                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
856                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
857                 /*
858                  * eb_el isn't always an interior node, but even leaf
859                  * nodes want a zero'd flags and reserved field so
860                  * this gets the whole 32 bits regardless of use.
861                  */
862                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
863                 if (!eb_el->l_tree_depth)
864                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
865
866                 status = ocfs2_journal_dirty(handle, bh);
867                 if (status < 0) {
868                         mlog_errno(status);
869                         goto bail;
870                 }
871
872                 next_blkno = le64_to_cpu(eb->h_blkno);
873         }
874
875         /* This is a bit hairy. We want to update up to three blocks
876          * here without leaving any of them in an inconsistent state
877          * in case of error. We don't have to worry about
878          * journal_dirty erroring as it won't unless we've aborted the
879          * handle (in which case we would never be here) so reserving
880          * the write with journal_access is all we need to do. */
881         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
882                                       OCFS2_JOURNAL_ACCESS_WRITE);
883         if (status < 0) {
884                 mlog_errno(status);
885                 goto bail;
886         }
887         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
888                                       OCFS2_JOURNAL_ACCESS_WRITE);
889         if (status < 0) {
890                 mlog_errno(status);
891                 goto bail;
892         }
893         if (eb_bh) {
894                 status = ocfs2_journal_access(handle, inode, eb_bh,
895                                               OCFS2_JOURNAL_ACCESS_WRITE);
896                 if (status < 0) {
897                         mlog_errno(status);
898                         goto bail;
899                 }
900         }
901
902         /* Link the new branch into the rest of the tree (el will
903  * either be on the root_bh, or the extent block passed in). */
904         i = le16_to_cpu(el->l_next_free_rec);
905         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
906         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
907         el->l_recs[i].e_int_clusters = 0;
908         le16_add_cpu(&el->l_next_free_rec, 1);
909
910         /* fe needs a new last extent block pointer, as does the
911          * next_leaf on the previously last-extent-block. */
912         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
913
914         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
915         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
916
917         status = ocfs2_journal_dirty(handle, *last_eb_bh);
918         if (status < 0)
919                 mlog_errno(status);
920         status = ocfs2_journal_dirty(handle, et->et_root_bh);
921         if (status < 0)
922                 mlog_errno(status);
923         if (eb_bh) {
924                 status = ocfs2_journal_dirty(handle, eb_bh);
925                 if (status < 0)
926                         mlog_errno(status);
927         }
928
929         /*
930          * Some callers want to track the rightmost leaf so pass it
931          * back here.
932          */
933         brelse(*last_eb_bh);
934         get_bh(new_eb_bhs[0]);
935         *last_eb_bh = new_eb_bhs[0];
936
937         status = 0;
938 bail:
939         if (new_eb_bhs) {
940                 for (i = 0; i < new_blocks; i++)
941                         if (new_eb_bhs[i])
942                                 brelse(new_eb_bhs[i]);
943                 kfree(new_eb_bhs);
944         }
945
946         mlog_exit(status);
947         return status;
948 }
949
950 /*
951  * adds another level to the allocation tree.
952  * returns back the new extent block so you can add a branch to it
953  * after this call.
954  */
955 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
956                                   handle_t *handle,
957                                   struct inode *inode,
958                                   struct ocfs2_extent_tree *et,
959                                   struct ocfs2_alloc_context *meta_ac,
960                                   struct buffer_head **ret_new_eb_bh)
961 {
962         int status, i;
963         u32 new_clusters;
964         struct buffer_head *new_eb_bh = NULL;
965         struct ocfs2_extent_block *eb;
966         struct ocfs2_extent_list  *root_el;
967         struct ocfs2_extent_list  *eb_el;
968
969         mlog_entry_void();
970
971         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
972                                            &new_eb_bh);
973         if (status < 0) {
974                 mlog_errno(status);
975                 goto bail;
976         }
977
978         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
979         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
980                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
981                 status = -EIO;
982                 goto bail;
983         }
984
985         eb_el = &eb->h_list;
986         root_el = et->et_root_el;
987
988         status = ocfs2_journal_access(handle, inode, new_eb_bh,
989                                       OCFS2_JOURNAL_ACCESS_CREATE);
990         if (status < 0) {
991                 mlog_errno(status);
992                 goto bail;
993         }
994
995         /* copy the root extent list data into the new extent block */
996         eb_el->l_tree_depth = root_el->l_tree_depth;
997         eb_el->l_next_free_rec = root_el->l_next_free_rec;
998         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
999                 eb_el->l_recs[i] = root_el->l_recs[i];
1000
1001         status = ocfs2_journal_dirty(handle, new_eb_bh);
1002         if (status < 0) {
1003                 mlog_errno(status);
1004                 goto bail;
1005         }
1006
1007         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1008                                       OCFS2_JOURNAL_ACCESS_WRITE);
1009         if (status < 0) {
1010                 mlog_errno(status);
1011                 goto bail;
1012         }
1013
1014         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1015
1016         /* update root_bh now */
1017         le16_add_cpu(&root_el->l_tree_depth, 1);
1018         root_el->l_recs[0].e_cpos = 0;
1019         root_el->l_recs[0].e_blkno = eb->h_blkno;
1020         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1021         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1022                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1023         root_el->l_next_free_rec = cpu_to_le16(1);
1024
1025         /* If this is our 1st tree depth shift, then last_eb_blk
1026          * becomes the allocated extent block */
1027         if (root_el->l_tree_depth == cpu_to_le16(1))
1028                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1029
1030         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1031         if (status < 0) {
1032                 mlog_errno(status);
1033                 goto bail;
1034         }
1035
1036         *ret_new_eb_bh = new_eb_bh;
1037         new_eb_bh = NULL;
1038         status = 0;
1039 bail:
1040         if (new_eb_bh)
1041                 brelse(new_eb_bh);
1042
1043         mlog_exit(status);
1044         return status;
1045 }
1046
1047 /*
1048  * Should only be called when there is no space left in any of the
1049  * leaf nodes. What we want to do is find the lowest tree depth
1050  * non-leaf extent block with room for new records. There are three
1051  * valid results of this search:
1052  *
1053  * 1) a lowest extent block is found, then we pass it back in
1054  *    *lowest_eb_bh and return '0'
1055  *
1056  * 2) the search fails to find anything, but the root_el has room. We
1057  *    pass NULL back in *target_bh, but still return '0'
1058  *
1059  * 3) the search fails to find anything AND the root_el is full, in
1060  *    which case we return > 0
1061  *
1062  * return status < 0 indicates an error.
1063  */
1064 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1065                                     struct inode *inode,
1066                                     struct ocfs2_extent_tree *et,
1067                                     struct buffer_head **target_bh)
1068 {
1069         int status = 0, i;
1070         u64 blkno;
1071         struct ocfs2_extent_block *eb;
1072         struct ocfs2_extent_list  *el;
1073         struct buffer_head *bh = NULL;
1074         struct buffer_head *lowest_bh = NULL;
1075
1076         mlog_entry_void();
1077
1078         *target_bh = NULL;
1079
1080         el = et->et_root_el;
1081
1082         while(le16_to_cpu(el->l_tree_depth) > 1) {
1083                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1084                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1085                                     "extent list (next_free_rec == 0)",
1086                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1087                         status = -EIO;
1088                         goto bail;
1089                 }
1090                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1091                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1092                 if (!blkno) {
1093                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1094                                     "list where extent # %d has no physical "
1095                                     "block start",
1096                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1097                         status = -EIO;
1098                         goto bail;
1099                 }
1100
1101                 if (bh) {
1102                         brelse(bh);
1103                         bh = NULL;
1104                 }
1105
1106                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
1107                                           inode);
1108                 if (status < 0) {
1109                         mlog_errno(status);
1110                         goto bail;
1111                 }
1112
1113                 eb = (struct ocfs2_extent_block *) bh->b_data;
1114                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1115                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1116                         status = -EIO;
1117                         goto bail;
1118                 }
1119                 el = &eb->h_list;
1120
1121                 if (le16_to_cpu(el->l_next_free_rec) <
1122                     le16_to_cpu(el->l_count)) {
1123                         if (lowest_bh)
1124                                 brelse(lowest_bh);
1125                         lowest_bh = bh;
1126                         get_bh(lowest_bh);
1127                 }
1128         }
1129
1130         /* If we didn't find one and the fe doesn't have any room,
1131          * then return '1' */
1132         el = et->et_root_el;
1133         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1134                 status = 1;
1135
1136         *target_bh = lowest_bh;
1137 bail:
1138         if (bh)
1139                 brelse(bh);
1140
1141         mlog_exit(status);
1142         return status;
1143 }
1144
1145 /*
1146  * Grow a b-tree so that it has more records.
1147  *
1148  * We might shift the tree depth in which case existing paths should
1149  * be considered invalid.
1150  *
1151  * Tree depth after the grow is returned via *final_depth.
1152  *
1153  * *last_eb_bh will be updated by ocfs2_add_branch().
1154  */
1155 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1156                            struct ocfs2_extent_tree *et, int *final_depth,
1157                            struct buffer_head **last_eb_bh,
1158                            struct ocfs2_alloc_context *meta_ac)
1159 {
1160         int ret, shift;
1161         struct ocfs2_extent_list *el = et->et_root_el;
1162         int depth = le16_to_cpu(el->l_tree_depth);
1163         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1164         struct buffer_head *bh = NULL;
1165
1166         BUG_ON(meta_ac == NULL);
1167
1168         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1169         if (shift < 0) {
1170                 ret = shift;
1171                 mlog_errno(ret);
1172                 goto out;
1173         }
1174
1175         /* We traveled all the way to the bottom of the allocation tree
1176          * and didn't find room for any more extents - we need to add
1177          * another tree level */
1178         if (shift) {
1179                 BUG_ON(bh);
1180                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1181
1182                 /* ocfs2_shift_tree_depth will return us a buffer with
1183                  * the new extent block (so we can pass that to
1184                  * ocfs2_add_branch). */
1185                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1186                                              meta_ac, &bh);
1187                 if (ret < 0) {
1188                         mlog_errno(ret);
1189                         goto out;
1190                 }
1191                 depth++;
1192                 if (depth == 1) {
1193                         /*
1194                          * Special case: we have room now if we shifted from
1195                          * tree_depth 0, so no more work needs to be done.
1196                          *
1197                          * We won't be calling add_branch, so pass
1198                          * back *last_eb_bh as the new leaf. At depth
1199                          * zero, it should always be null so there's
1200                          * no reason to brelse.
1201                          */
1202                         BUG_ON(*last_eb_bh);
1203                         get_bh(bh);
1204                         *last_eb_bh = bh;
1205                         goto out;
1206                 }
1207         }
1208
1209         /* call ocfs2_add_branch to add the final part of the tree with
1210          * the new data. */
1211         mlog(0, "add branch. bh = %p\n", bh);
1212         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1213                                meta_ac);
1214         if (ret < 0) {
1215                 mlog_errno(ret);
1216                 goto out;
1217         }
1218
1219 out:
1220         if (final_depth)
1221                 *final_depth = depth;
1222         brelse(bh);
1223         return ret;
1224 }
1225
1226 /*
1227  * Shift every record in the list right by one slot, making room at index 0.
1228  */
1229 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1230 {
1231         int next_free = le16_to_cpu(el->l_next_free_rec);
1232         int count = le16_to_cpu(el->l_count);
1233         unsigned int num_bytes;
1234
1235         BUG_ON(!next_free);
1236         /* This will cause us to go off the end of our extent list. */
1237         BUG_ON(next_free >= count);
1238
1239         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1240
1241         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1242 }
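
Illustration with a three-record list (hypothetical contents); the memmove leaves a stale duplicate in slot 0 for the caller to overwrite:

/* before:  l_recs[0] = A, l_recs[1] = B, l_recs[2] = C, next_free = 3
 * after:   l_recs[0] = A, l_recs[1] = A, l_recs[2] = B, l_recs[3] = C
 *
 * ocfs2_create_empty_extent() below then zeroes l_recs[0] and bumps
 * l_next_free_rec to 4.
 */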
1243
1244 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1245                               struct ocfs2_extent_rec *insert_rec)
1246 {
1247         int i, insert_index, next_free, has_empty, num_bytes;
1248         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1249         struct ocfs2_extent_rec *rec;
1250
1251         next_free = le16_to_cpu(el->l_next_free_rec);
1252         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1253
1254         BUG_ON(!next_free);
1255
1256         /* The tree code before us didn't allow enough room in the leaf. */
1257         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1258
1259         /*
1260          * The easiest way to approach this is to just remove the
1261          * empty extent and temporarily decrement next_free.
1262          */
1263         if (has_empty) {
1264                 /*
1265                  * If next_free was 1 (only an empty extent), this
1266                  * loop won't execute, which is fine. We still want
1267                  * the decrement above to happen.
1268                  */
1269                 for(i = 0; i < (next_free - 1); i++)
1270                         el->l_recs[i] = el->l_recs[i+1];
1271
1272                 next_free--;
1273         }
1274
1275         /*
1276          * Figure out what the new record index should be.
1277          */
1278         for(i = 0; i < next_free; i++) {
1279                 rec = &el->l_recs[i];
1280
1281                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1282                         break;
1283         }
1284         insert_index = i;
1285
1286         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1287              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1288
1289         BUG_ON(insert_index < 0);
1290         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1291         BUG_ON(insert_index > next_free);
1292
1293         /*
1294          * No need to memmove if we're just adding to the tail.
1295          */
1296         if (insert_index != next_free) {
1297                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1298
1299                 num_bytes = next_free - insert_index;
1300                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1301                 memmove(&el->l_recs[insert_index + 1],
1302                         &el->l_recs[insert_index],
1303                         num_bytes);
1304         }
1305
1306         /*
1307  * Either we had an empty extent and need to re-increment, or
1308  * there was no empty extent on a non-full rightmost leaf node,
1309          * in which case we still need to increment.
1310          */
1311         next_free++;
1312         el->l_next_free_rec = cpu_to_le16(next_free);
1313         /*
1314          * Make sure none of the math above just messed up our tree.
1315          */
1316         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1317
1318         el->l_recs[insert_index] = *insert_rec;
1319
1320 }
1321
1322 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1323 {
1324         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1325
1326         BUG_ON(num_recs == 0);
1327
1328         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1329                 num_recs--;
1330                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1331                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1332                 memset(&el->l_recs[num_recs], 0,
1333                        sizeof(struct ocfs2_extent_rec));
1334                 el->l_next_free_rec = cpu_to_le16(num_recs);
1335         }
1336 }
1337
1338 /*
1339  * Create an empty extent record.
1340  *
1341  * l_next_free_rec may be updated.
1342  *
1343  * If an empty extent already exists do nothing.
1344  */
1345 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1346 {
1347         int next_free = le16_to_cpu(el->l_next_free_rec);
1348
1349         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1350
1351         if (next_free == 0)
1352                 goto set_and_inc;
1353
1354         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1355                 return;
1356
1357         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1358                         "Asked to create an empty extent in a full list:\n"
1359                         "count = %u, tree depth = %u",
1360                         le16_to_cpu(el->l_count),
1361                         le16_to_cpu(el->l_tree_depth));
1362
1363         ocfs2_shift_records_right(el);
1364
1365 set_and_inc:
1366         le16_add_cpu(&el->l_next_free_rec, 1);
1367         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1368 }
1369
1370 /*
1371  * For a rotation which involves two leaf nodes, the "root node" is
1372  * the lowest level tree node which contains a path to both leaves. This
1373  * resulting set of information can be used to form a complete "subtree".
1374  *
1375  * This function is passed two full paths from the dinode down to a
1376  * pair of adjacent leaves. Its task is to figure out which path
1377  * index contains the subtree root - this can be the root index itself
1378  * in a worst-case rotation.
1379  *
1380  * The array index of the subtree root is passed back.
1381  */
1382 static int ocfs2_find_subtree_root(struct inode *inode,
1383                                    struct ocfs2_path *left,
1384                                    struct ocfs2_path *right)
1385 {
1386         int i = 0;
1387
1388         /*
1389          * Check that the caller passed in two paths from the same tree.
1390          */
1391         BUG_ON(path_root_bh(left) != path_root_bh(right));
1392
1393         do {
1394                 i++;
1395
1396                 /*
1397                  * The caller didn't pass two adjacent paths.
1398                  */
1399                 mlog_bug_on_msg(i > left->p_tree_depth,
1400                                 "Inode %lu, left depth %u, right depth %u\n"
1401                                 "left leaf blk %llu, right leaf blk %llu\n",
1402                                 inode->i_ino, left->p_tree_depth,
1403                                 right->p_tree_depth,
1404                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1405                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1406         } while (left->p_node[i].bh->b_blocknr ==
1407                  right->p_node[i].bh->b_blocknr);
1408
1409         return i - 1;
1410 }
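
A small example (hypothetical block numbers): two adjacent paths through a depth-2 tree that share the root and the first interior block but end in different leaves first differ at index 2, so the subtree root index is 1:

/*             left path        right path
 * p_node[0]   root (shared)    root (shared)
 * p_node[1]   blkno 500        blkno 500
 * p_node[2]   blkno 610        blkno 611
 *
 * ocfs2_find_subtree_root(inode, left, right) == 1
 */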
1411
1412 typedef void (path_insert_t)(void *, struct buffer_head *);
1413
1414 /*
1415  * Traverse a btree path in search of cpos, starting at root_el.
1416  *
1417  * This code can be called with a cpos larger than the tree, in which
1418  * case it will return the rightmost path.
1419  */
1420 static int __ocfs2_find_path(struct inode *inode,
1421                              struct ocfs2_extent_list *root_el, u32 cpos,
1422                              path_insert_t *func, void *data)
1423 {
1424         int i, ret = 0;
1425         u32 range;
1426         u64 blkno;
1427         struct buffer_head *bh = NULL;
1428         struct ocfs2_extent_block *eb;
1429         struct ocfs2_extent_list *el;
1430         struct ocfs2_extent_rec *rec;
1431         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1432
1433         el = root_el;
1434         while (el->l_tree_depth) {
1435                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1436                         ocfs2_error(inode->i_sb,
1437                                     "Inode %llu has empty extent list at "
1438                                     "depth %u\n",
1439                                     (unsigned long long)oi->ip_blkno,
1440                                     le16_to_cpu(el->l_tree_depth));
1441                         ret = -EROFS;
1442                         goto out;
1443
1444                 }
1445
1446                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1447                         rec = &el->l_recs[i];
1448
1449                         /*
1450                          * In the case that cpos is off the allocation
1451                          * tree, this should just wind up returning the
1452                          * rightmost record.
1453                          */
1454                         range = le32_to_cpu(rec->e_cpos) +
1455                                 ocfs2_rec_clusters(el, rec);
1456                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1457                             break;
1458                 }
1459
1460                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1461                 if (blkno == 0) {
1462                         ocfs2_error(inode->i_sb,
1463                                     "Inode %llu has bad blkno in extent list "
1464                                     "at depth %u (index %d)\n",
1465                                     (unsigned long long)oi->ip_blkno,
1466                                     le16_to_cpu(el->l_tree_depth), i);
1467                         ret = -EROFS;
1468                         goto out;
1469                 }
1470
1471                 brelse(bh);
1472                 bh = NULL;
1473                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1474                                        &bh, OCFS2_BH_CACHED, inode);
1475                 if (ret) {
1476                         mlog_errno(ret);
1477                         goto out;
1478                 }
1479
1480                 eb = (struct ocfs2_extent_block *) bh->b_data;
1481                 el = &eb->h_list;
1482                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1483                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1484                         ret = -EIO;
1485                         goto out;
1486                 }
1487
1488                 if (le16_to_cpu(el->l_next_free_rec) >
1489                     le16_to_cpu(el->l_count)) {
1490                         ocfs2_error(inode->i_sb,
1491                                     "Inode %llu has bad count in extent list "
1492                                     "at block %llu (next free=%u, count=%u)\n",
1493                                     (unsigned long long)oi->ip_blkno,
1494                                     (unsigned long long)bh->b_blocknr,
1495                                     le16_to_cpu(el->l_next_free_rec),
1496                                     le16_to_cpu(el->l_count));
1497                         ret = -EROFS;
1498                         goto out;
1499                 }
1500
1501                 if (func)
1502                         func(data, bh);
1503         }
1504
1505 out:
1506         /*
1507          * Catch any trailing bh that the loop didn't handle.
1508          */
1509         brelse(bh);
1510
1511         return ret;
1512 }
1513
1514 /*
1515  * Given an initialized path (that is, it has a valid root extent
1516  * list), this function will traverse the btree in search of the path
1517  * which would contain cpos.
1518  *
1519  * The path traveled is recorded in the path structure.
1520  *
1521  * Note that this will not do any comparisons on leaf node extent
1522  * records, so it will work fine in the case that we just added a tree
1523  * branch.
1524  */
1525 struct find_path_data {
1526         int index;
1527         struct ocfs2_path *path;
1528 };
1529 static void find_path_ins(void *data, struct buffer_head *bh)
1530 {
1531         struct find_path_data *fp = data;
1532
1533         get_bh(bh);
1534         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1535         fp->index++;
1536 }
1537 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1538                            u32 cpos)
1539 {
1540         struct find_path_data data;
1541
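        /*
         * Slot 0 of the path already holds the root node, so new extent
         * block buffers are inserted starting at index 1.
         */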
1542         data.index = 1;
1543         data.path = path;
1544         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1545                                  find_path_ins, &data);
1546 }
1547
1548 static void find_leaf_ins(void *data, struct buffer_head *bh)
1549 {
1550         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1551         struct ocfs2_extent_list *el = &eb->h_list;
1552         struct buffer_head **ret = data;
1553
1554         /* We want to retain only the leaf block. */
1555         if (le16_to_cpu(el->l_tree_depth) == 0) {
1556                 get_bh(bh);
1557                 *ret = bh;
1558         }
1559 }
1560 /*
1561  * Find the leaf block in the tree which would contain cpos. No
1562  * checking of the actual leaf is done.
1563  *
1564  * Some callers want to call this instead of allocating a path structure
1565  * and calling ocfs2_find_path().
1566  *
1567  * This function doesn't handle non-btree extent lists.
1568  */
1569 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1570                     u32 cpos, struct buffer_head **leaf_bh)
1571 {
1572         int ret;
1573         struct buffer_head *bh = NULL;
1574
1575         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1576         if (ret) {
1577                 mlog_errno(ret);
1578                 goto out;
1579         }
1580
1581         *leaf_bh = bh;
1582 out:
1583         return ret;
1584 }
1585
1586 /*
1587  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1588  *
1589  * Basically, we've moved stuff around at the bottom of the tree and
1590  * we need to fix up the extent records above the changes to reflect
1591  * the new changes.
1592  *
1593  * left_rec: the record on the left.
1594  * left_child_el: the child list pointed to by left_rec.
1595  * right_rec: the record to the right of left_rec.
1596  * right_child_el: the child list pointed to by right_rec.
1597  *
1598  * By definition, this only works on interior nodes.
1599  */
1600 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1601                                   struct ocfs2_extent_list *left_child_el,
1602                                   struct ocfs2_extent_rec *right_rec,
1603                                   struct ocfs2_extent_list *right_child_el)
1604 {
1605         u32 left_clusters, right_end;
1606
1607         /*
1608          * Interior nodes never have holes. Their cpos is the cpos of
1609          * the leftmost record in their child list. Their cluster
1610          * count covers the full theoretical range of their child list
1611          * - the range between their cpos and the cpos of the record
1612          * immediately to their right.
1613          */
1614         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1615         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1616                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1617                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1618         }
1619         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1620         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1621
1622         /*
1623          * Calculate the rightmost cluster count boundary before
1624          * moving cpos - we will need to adjust clusters after
1625          * updating e_cpos to keep the same highest cluster count.
1626          */
1627         right_end = le32_to_cpu(right_rec->e_cpos);
1628         right_end += le32_to_cpu(right_rec->e_int_clusters);
1629
1630         right_rec->e_cpos = left_rec->e_cpos;
1631         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1632
1633         right_end -= le32_to_cpu(right_rec->e_cpos);
1634         right_rec->e_int_clusters = cpu_to_le32(right_end);
1635 }
1636
1637 /*
1638  * Adjust the adjacent root node records involved in a
1639  * rotation. left_el_blkno is passed in as a key so that we can easily
1640  * find its index in the root list.
1641  */
1642 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1643                                       struct ocfs2_extent_list *left_el,
1644                                       struct ocfs2_extent_list *right_el,
1645                                       u64 left_el_blkno)
1646 {
1647         int i;
1648
1649         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1650                le16_to_cpu(left_el->l_tree_depth));
1651
1652         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1653                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1654                         break;
1655         }
1656
1657         /*
1658          * The path walking code should have never returned a root and
1659          * two paths which are not adjacent.
1660          */
1661         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1662
1663         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1664                                       &root_el->l_recs[i + 1], right_el);
1665 }
1666
1667 /*
1668  * We've changed a leaf block (in right_path) and need to reflect that
1669  * change back up the subtree.
1670  *
1671  * This happens in multiple places:
1672  *   - When we've moved an extent record from the left path leaf to the right
1673  *     path leaf to make room for an empty extent in the left path leaf.
1674  *   - When our insert into the right path leaf is at the leftmost edge
1675  *     and requires an update of the path immediately to its left. This
1676  *     can occur at the end of some types of rotation and appending inserts.
1677  *   - When we've adjusted the last extent record in the left path leaf and the
1678  *     1st extent record in the right path leaf during cross extent block merge.
1679  */
1680 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1681                                        struct ocfs2_path *left_path,
1682                                        struct ocfs2_path *right_path,
1683                                        int subtree_index)
1684 {
1685         int ret, i, idx;
1686         struct ocfs2_extent_list *el, *left_el, *right_el;
1687         struct ocfs2_extent_rec *left_rec, *right_rec;
1688         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1689
1690         /*
1691          * Update the counts and position values within all the
1692          * interior nodes to reflect the leaf rotation we just did.
1693          *
1694          * The root node is handled below the loop.
1695          *
1696          * We begin the loop with right_el and left_el pointing to the
1697          * leaf lists and work our way up.
1698          *
1699          * NOTE: within this loop, left_el and right_el always refer
1700          * to the *child* lists.
1701          */
1702         left_el = path_leaf_el(left_path);
1703         right_el = path_leaf_el(right_path);
1704         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1705                 mlog(0, "Adjust records at index %u\n", i);
1706
1707                 /*
1708                  * One nice property of knowing that all of these
1709                  * nodes are below the root is that we only deal with
1710                  * the leftmost right node record and the rightmost
1711                  * left node record.
1712                  */
1713                 el = left_path->p_node[i].el;
1714                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1715                 left_rec = &el->l_recs[idx];
1716
1717                 el = right_path->p_node[i].el;
1718                 right_rec = &el->l_recs[0];
1719
1720                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1721                                               right_el);
1722
1723                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1724                 if (ret)
1725                         mlog_errno(ret);
1726
1727                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1728                 if (ret)
1729                         mlog_errno(ret);
1730
1731                 /*
1732                  * Setup our list pointers now so that the current
1733                  * parents become children in the next iteration.
1734                  */
1735                 left_el = left_path->p_node[i].el;
1736                 right_el = right_path->p_node[i].el;
1737         }
1738
1739         /*
1740          * At the root node, adjust the two adjacent records which
1741          * begin our path to the leaves.
1742          */
1743
1744         el = left_path->p_node[subtree_index].el;
1745         left_el = left_path->p_node[subtree_index + 1].el;
1746         right_el = right_path->p_node[subtree_index + 1].el;
1747
1748         ocfs2_adjust_root_records(el, left_el, right_el,
1749                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1750
1751         root_bh = left_path->p_node[subtree_index].bh;
1752
1753         ret = ocfs2_journal_dirty(handle, root_bh);
1754         if (ret)
1755                 mlog_errno(ret);
1756 }
1757
1758 static int ocfs2_rotate_subtree_right(struct inode *inode,
1759                                       handle_t *handle,
1760                                       struct ocfs2_path *left_path,
1761                                       struct ocfs2_path *right_path,
1762                                       int subtree_index)
1763 {
1764         int ret, i;
1765         struct buffer_head *right_leaf_bh;
1766         struct buffer_head *left_leaf_bh = NULL;
1767         struct buffer_head *root_bh;
1768         struct ocfs2_extent_list *right_el, *left_el;
1769         struct ocfs2_extent_rec move_rec;
1770
1771         left_leaf_bh = path_leaf_bh(left_path);
1772         left_el = path_leaf_el(left_path);
1773
1774         if (left_el->l_next_free_rec != left_el->l_count) {
1775                 ocfs2_error(inode->i_sb,
1776                             "Inode %llu has non-full interior leaf node %llu "
1777                             "(next free = %u)",
1778                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1779                             (unsigned long long)left_leaf_bh->b_blocknr,
1780                             le16_to_cpu(left_el->l_next_free_rec));
1781                 return -EROFS;
1782         }
1783
1784         /*
1785          * This extent block may already have an empty record, so we
1786          * return early if so.
1787          */
1788         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1789                 return 0;
1790
1791         root_bh = left_path->p_node[subtree_index].bh;
1792         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1793
1794         ret = ocfs2_journal_access(handle, inode, root_bh,
1795                                    OCFS2_JOURNAL_ACCESS_WRITE);
1796         if (ret) {
1797                 mlog_errno(ret);
1798                 goto out;
1799         }
1800
1801         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1802                 ret = ocfs2_journal_access(handle, inode,
1803                                            right_path->p_node[i].bh,
1804                                            OCFS2_JOURNAL_ACCESS_WRITE);
1805                 if (ret) {
1806                         mlog_errno(ret);
1807                         goto out;
1808                 }
1809
1810                 ret = ocfs2_journal_access(handle, inode,
1811                                            left_path->p_node[i].bh,
1812                                            OCFS2_JOURNAL_ACCESS_WRITE);
1813                 if (ret) {
1814                         mlog_errno(ret);
1815                         goto out;
1816                 }
1817         }
1818
1819         right_leaf_bh = path_leaf_bh(right_path);
1820         right_el = path_leaf_el(right_path);
1821
1822         /* This is a code error, not disk corruption. */
1823         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1824                         "because rightmost leaf block %llu is empty\n",
1825                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1826                         (unsigned long long)right_leaf_bh->b_blocknr);
1827
1828         ocfs2_create_empty_extent(right_el);
1829
1830         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1831         if (ret) {
1832                 mlog_errno(ret);
1833                 goto out;
1834         }
1835
1836         /* Do the copy now. */
1837         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1838         move_rec = left_el->l_recs[i];
1839         right_el->l_recs[0] = move_rec;
1840
1841         /*
1842          * Clear out the record we just copied and shift everything
1843          * over, leaving an empty extent in the left leaf.
1844          *
1845          * We temporarily subtract from next_free_rec so that the
1846          * shift will lose the tail record (which is now defunct).
1847          */
1848         le16_add_cpu(&left_el->l_next_free_rec, -1);
1849         ocfs2_shift_records_right(left_el);
1850         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1851         le16_add_cpu(&left_el->l_next_free_rec, 1);
1852
1853         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1854         if (ret) {
1855                 mlog_errno(ret);
1856                 goto out;
1857         }
1858
1859         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1860                                 subtree_index);
1861
1862 out:
1863         return ret;
1864 }
1865
1866 /*
1867  * Given a full path, determine what cpos value would return us a path
1868  * containing the leaf immediately to the left of the current one.
1869  *
1870  * Will return zero if the path passed in is already the leftmost path.
1871  */
1872 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1873                                          struct ocfs2_path *path, u32 *cpos)
1874 {
1875         int i, j, ret = 0;
1876         u64 blkno;
1877         struct ocfs2_extent_list *el;
1878
1879         BUG_ON(path->p_tree_depth == 0);
1880
1881         *cpos = 0;
1882
1883         blkno = path_leaf_bh(path)->b_blocknr;
1884
1885         /* Start at the tree node just above the leaf and work our way up. */
1886         i = path->p_tree_depth - 1;
1887         while (i >= 0) {
1888                 el = path->p_node[i].el;
1889
1890                 /*
1891                  * Find the extent record just before the one in our
1892                  * path.
1893                  */
1894                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1895                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1896                                 if (j == 0) {
1897                                         if (i == 0) {
1898                                                 /*
1899                                                  * We've determined that the
1900                                                  * path specified is already
1901                                                  * the leftmost one - return a
1902                                                  * cpos of zero.
1903                                                  */
1904                                                 goto out;
1905                                         }
1906                                         /*
1907                                          * The leftmost record points to our
1908                                          * leaf - we need to travel up the
1909                                          * tree one level.
1910                                          */
1911                                         goto next_node;
1912                                 }
1913
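                                /*
                                 * Return the last cpos covered by the record
                                 * to our left - looking up any cpos within
                                 * that record's range yields the leaf
                                 * immediately to the left.
                                 */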
1914                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1915                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1916                                                            &el->l_recs[j - 1]);
1917                                 *cpos = *cpos - 1;
1918                                 goto out;
1919                         }
1920                 }
1921
1922                 /*
1923                  * If we got here, we never found a valid node where
1924                  * the tree indicated one should be.
1925                  */
1926                 ocfs2_error(sb,
1927                             "Invalid extent tree at extent block %llu\n",
1928                             (unsigned long long)blkno);
1929                 ret = -EROFS;
1930                 goto out;
1931
1932 next_node:
1933                 blkno = path->p_node[i].bh->b_blocknr;
1934                 i--;
1935         }
1936
1937 out:
1938         return ret;
1939 }
1940
1941 /*
1942  * Extend the transaction by enough credits to complete the rotation,
1943  * and still leave at least the original number of credits allocated
1944  * to this transaction.
1945  */
1946 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1947                                            int op_credits,
1948                                            struct ocfs2_path *path)
1949 {
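        /*
         * Two buffers (one per path) for every level below the subtree
         * root, plus the subtree root itself, plus whatever the caller's
         * operation needs.
         */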
1950         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1951
1952         if (handle->h_buffer_credits < credits)
1953                 return ocfs2_extend_trans(handle, credits);
1954
1955         return 0;
1956 }
1957
1958 /*
1959  * Trap the case where we're inserting into the theoretical range past
1960  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1961  * whose cpos is less than ours into the right leaf.
1962  *
1963  * It's only necessary to look at the rightmost record of the left
1964  * leaf because the logic that calls us should ensure that the
1965  * theoretical ranges in the path components above the leaves are
1966  * correct.
1967  */
1968 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1969                                                  u32 insert_cpos)
1970 {
1971         struct ocfs2_extent_list *left_el;
1972         struct ocfs2_extent_rec *rec;
1973         int next_free;
1974
1975         left_el = path_leaf_el(left_path);
1976         next_free = le16_to_cpu(left_el->l_next_free_rec);
1977         rec = &left_el->l_recs[next_free - 1];
1978
1979         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1980                 return 1;
1981         return 0;
1982 }
1983
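/*
 * Return 1 if cpos falls within the leftmost non-empty record of 'el',
 * 0 otherwise.
 */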
1984 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1985 {
1986         int next_free = le16_to_cpu(el->l_next_free_rec);
1987         unsigned int range;
1988         struct ocfs2_extent_rec *rec;
1989
1990         if (next_free == 0)
1991                 return 0;
1992
1993         rec = &el->l_recs[0];
1994         if (ocfs2_is_empty_extent(rec)) {
1995                 /* Empty list. */
1996                 if (next_free == 1)
1997                         return 0;
1998                 rec = &el->l_recs[1];
1999         }
2000
2001         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2002         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2003                 return 1;
2004         return 0;
2005 }
2006
2007 /*
2008  * Rotate all the records in a btree right one record, starting at insert_cpos.
2009  *
2010  * The path to the rightmost leaf should be passed in.
2011  *
2012  * The array is assumed to be large enough to hold an entire path (tree depth).
2013  *
2014  * Upon successful return from this function:
2015  *
2016  * - The 'right_path' array will contain a path to the leaf block
2017  *   whose range contains insert_cpos.
2018  * - That leaf block will have a single empty extent in list index 0.
2019  * - In the case that the rotation requires a post-insert update,
2020  *   *ret_left_path will contain a valid path which can be passed to
2021  *   ocfs2_insert_path().
2022  */
2023 static int ocfs2_rotate_tree_right(struct inode *inode,
2024                                    handle_t *handle,
2025                                    enum ocfs2_split_type split,
2026                                    u32 insert_cpos,
2027                                    struct ocfs2_path *right_path,
2028                                    struct ocfs2_path **ret_left_path)
2029 {
2030         int ret, start, orig_credits = handle->h_buffer_credits;
2031         u32 cpos;
2032         struct ocfs2_path *left_path = NULL;
2033
2034         *ret_left_path = NULL;
2035
2036         left_path = ocfs2_new_path(path_root_bh(right_path),
2037                                    path_root_el(right_path));
2038         if (!left_path) {
2039                 ret = -ENOMEM;
2040                 mlog_errno(ret);
2041                 goto out;
2042         }
2043
2044         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2045         if (ret) {
2046                 mlog_errno(ret);
2047                 goto out;
2048         }
2049
2050         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2051
2052         /*
2053          * What we want to do here is:
2054          *
2055          * 1) Start with the rightmost path.
2056          *
2057          * 2) Determine a path to the leaf block directly to the left
2058          *    of that leaf.
2059          *
2060          * 3) Determine the 'subtree root' - the lowest level tree node
2061          *    which contains a path to both leaves.
2062          *
2063          * 4) Rotate the subtree.
2064          *
2065          * 5) Find the next subtree by considering the left path to be
2066          *    the new right path.
2067          *
2068          * The check at the top of this while loop also accepts
2069          * insert_cpos == cpos because cpos is only a _theoretical_
2070          * value to get us the left path - insert_cpos might very well
2071          * be filling that hole.
2072          *
2073          * Stop at a cpos of '0' because we either started at the
2074          * leftmost branch (i.e., a tree with one branch and a
2075          * rotation inside of it), or we've gone as far as we can in
2076          * rotating subtrees.
2077          */
2078         while (cpos && insert_cpos <= cpos) {
2079                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2080                      insert_cpos, cpos);
2081
2082                 ret = ocfs2_find_path(inode, left_path, cpos);
2083                 if (ret) {
2084                         mlog_errno(ret);
2085                         goto out;
2086                 }
2087
2088                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2089                                 path_leaf_bh(right_path),
2090                                 "Inode %lu: error during insert of %u "
2091                                 "(left path cpos %u) results in two identical "
2092                                 "paths ending at %llu\n",
2093                                 inode->i_ino, insert_cpos, cpos,
2094                                 (unsigned long long)
2095                                 path_leaf_bh(left_path)->b_blocknr);
2096
2097                 if (split == SPLIT_NONE &&
2098                     ocfs2_rotate_requires_path_adjustment(left_path,
2099                                                           insert_cpos)) {
2100
2101                         /*
2102                          * We've rotated the tree as much as we
2103                          * should. The rest is up to
2104                          * ocfs2_insert_path() to complete, after the
2105                          * record insertion. We indicate this
2106                          * situation by returning the left path.
2107                          *
2108                          * The reason we don't adjust the records here
2109                          * before the record insert is that an error
2110                          * later might break the rule where a parent
2111                          * record e_cpos will reflect the actual
2112                          * e_cpos of the 1st nonempty record of the
2113                          * child list.
2114                          */
2115                         *ret_left_path = left_path;
2116                         goto out_ret_path;
2117                 }
2118
2119                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2120
2121                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2122                      start,
2123                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2124                      right_path->p_tree_depth);
2125
2126                 ret = ocfs2_extend_rotate_transaction(handle, start,
2127                                                       orig_credits, right_path);
2128                 if (ret) {
2129                         mlog_errno(ret);
2130                         goto out;
2131                 }
2132
2133                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2134                                                  right_path, start);
2135                 if (ret) {
2136                         mlog_errno(ret);
2137                         goto out;
2138                 }
2139
2140                 if (split != SPLIT_NONE &&
2141                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2142                                                 insert_cpos)) {
2143                         /*
2144                          * A rotate moves the rightmost left leaf
2145                          * record over to the leftmost right leaf
2146                          * slot. If we're doing an extent split
2147                          * instead of a real insert, then we have to
2148                          * check that the extent to be split wasn't
2149                          * just moved over. If it was, then we can
2150                          * exit here, passing left_path back -
2151                          * ocfs2_split_extent() is smart enough to
2152                          * search both leaves.
2153                          */
2154                         *ret_left_path = left_path;
2155                         goto out_ret_path;
2156                 }
2157
2158                 /*
2159                  * There is no need to re-read the next right path
2160                  * as we know that it'll be our current left
2161                  * path. Optimize by copying values instead.
2162                  */
2163                 ocfs2_mv_path(right_path, left_path);
2164
2165                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2166                                                     &cpos);
2167                 if (ret) {
2168                         mlog_errno(ret);
2169                         goto out;
2170                 }
2171         }
2172
2173 out:
2174         ocfs2_free_path(left_path);
2175
2176 out_ret_path:
2177         return ret;
2178 }
2179
2180 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2181                                       struct ocfs2_path *path)
2182 {
2183         int i, idx;
2184         struct ocfs2_extent_rec *rec;
2185         struct ocfs2_extent_list *el;
2186         struct ocfs2_extent_block *eb;
2187         u32 range;
2188
2189         /* Path should always be rightmost. */
2190         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2191         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2192
2193         el = &eb->h_list;
2194         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2195         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2196         rec = &el->l_recs[idx];
2197         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2198
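        /*
         * Stretch the rightmost record at every interior level so that it
         * extends to 'range', the end of the new rightmost leaf record.
         */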
2199         for (i = 0; i < path->p_tree_depth; i++) {
2200                 el = path->p_node[i].el;
2201                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2202                 rec = &el->l_recs[idx];
2203
2204                 rec->e_int_clusters = cpu_to_le32(range);
2205                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2206
2207                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2208         }
2209 }
2210
2211 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2212                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2213                               struct ocfs2_path *path, int unlink_start)
2214 {
2215         int ret, i;
2216         struct ocfs2_extent_block *eb;
2217         struct ocfs2_extent_list *el;
2218         struct buffer_head *bh;
2219
2220         for(i = unlink_start; i < path_num_items(path); i++) {
2221                 bh = path->p_node[i].bh;
2222
2223                 eb = (struct ocfs2_extent_block *)bh->b_data;
2224                 /*
2225                  * Not all nodes might have had their final count
2226                  * decremented by the caller - handle this here.
2227                  */
2228                 el = &eb->h_list;
2229                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2230                         mlog(ML_ERROR,
2231                              "Inode %llu, attempted to remove extent block "
2232                              "%llu with %u records\n",
2233                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2234                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2235                              le16_to_cpu(el->l_next_free_rec));
2236
2237                         ocfs2_journal_dirty(handle, bh);
2238                         ocfs2_remove_from_cache(inode, bh);
2239                         continue;
2240                 }
2241
2242                 el->l_next_free_rec = 0;
2243                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2244
2245                 ocfs2_journal_dirty(handle, bh);
2246
2247                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2248                 if (ret)
2249                         mlog_errno(ret);
2250
2251                 ocfs2_remove_from_cache(inode, bh);
2252         }
2253 }
2254
2255 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2256                                  struct ocfs2_path *left_path,
2257                                  struct ocfs2_path *right_path,
2258                                  int subtree_index,
2259                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2260 {
2261         int i;
2262         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2263         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2264         struct ocfs2_extent_list *el;
2265         struct ocfs2_extent_block *eb;
2266
2267         el = path_leaf_el(left_path);
2268
2269         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2270
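        /*
         * Find the record in the subtree root which points at the top of
         * the right subtree - that's the link we're about to remove.
         */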
2271         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2272                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2273                         break;
2274
2275         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2276
2277         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2278         le16_add_cpu(&root_el->l_next_free_rec, -1);
2279
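        /*
         * The left leaf becomes the new rightmost leaf, so terminate the
         * leaf chain there.
         */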
2280         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2281         eb->h_next_leaf_blk = 0;
2282
2283         ocfs2_journal_dirty(handle, root_bh);
2284         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2285
2286         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2287                           subtree_index + 1);
2288 }
2289
2290 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2291                                      struct ocfs2_path *left_path,
2292                                      struct ocfs2_path *right_path,
2293                                      int subtree_index,
2294                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2295                                      int *deleted,
2296                                      struct ocfs2_extent_tree *et)
2297 {
2298         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2299         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2300         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2301         struct ocfs2_extent_block *eb;
2302
2303         *deleted = 0;
2304
2305         right_leaf_el = path_leaf_el(right_path);
2306         left_leaf_el = path_leaf_el(left_path);
2307         root_bh = left_path->p_node[subtree_index].bh;
2308         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2309
2310         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2311                 return 0;
2312
2313         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2314         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2315                 /*
2316                  * It's legal for us to proceed if the right leaf is
2317                  * the rightmost one and it has an empty extent. There
2318                  * are two cases to handle - whether the leaf will be
2319                  * empty after removal or not. If the leaf isn't empty
2320                  * then just remove the empty extent up front. The
2321                  * next block will handle empty leaves by flagging
2322                  * them for unlink.
2323                  *
2324  * Non-rightmost leaves will return -EAGAIN, and the
2325                  * caller can manually move the subtree and retry.
2326                  */
2327
2328                 if (eb->h_next_leaf_blk != 0ULL)
2329                         return -EAGAIN;
2330
2331                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2332                         ret = ocfs2_journal_access(handle, inode,
2333                                                    path_leaf_bh(right_path),
2334                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2335                         if (ret) {
2336                                 mlog_errno(ret);
2337                                 goto out;
2338                         }
2339
2340                         ocfs2_remove_empty_extent(right_leaf_el);
2341                 } else
2342                         right_has_empty = 1;
2343         }
2344
2345         if (eb->h_next_leaf_blk == 0ULL &&
2346             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2347                 /*
2348                  * We have to update i_last_eb_blk during the meta
2349                  * data delete.
2350                  */
2351                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2352                                            OCFS2_JOURNAL_ACCESS_WRITE);
2353                 if (ret) {
2354                         mlog_errno(ret);
2355                         goto out;
2356                 }
2357
2358                 del_right_subtree = 1;
2359         }
2360
2361         /*
2362          * Getting here with an empty extent in the right path implies
2363          * that it's the rightmost path and will be deleted.
2364          */
2365         BUG_ON(right_has_empty && !del_right_subtree);
2366
2367         ret = ocfs2_journal_access(handle, inode, root_bh,
2368                                    OCFS2_JOURNAL_ACCESS_WRITE);
2369         if (ret) {
2370                 mlog_errno(ret);
2371                 goto out;
2372         }
2373
2374         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2375                 ret = ocfs2_journal_access(handle, inode,
2376                                            right_path->p_node[i].bh,
2377                                            OCFS2_JOURNAL_ACCESS_WRITE);
2378                 if (ret) {
2379                         mlog_errno(ret);
2380                         goto out;
2381                 }
2382
2383                 ret = ocfs2_journal_access(handle, inode,
2384                                            left_path->p_node[i].bh,
2385                                            OCFS2_JOURNAL_ACCESS_WRITE);
2386                 if (ret) {
2387                         mlog_errno(ret);
2388                         goto out;
2389                 }
2390         }
2391
2392         if (!right_has_empty) {
2393                 /*
2394                  * Only do this if we're moving a real
2395                  * record. Otherwise, the action is delayed until
2396                  * after removal of the right path in which case we
2397                  * can do a simple shift to remove the empty extent.
2398                  */
2399                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2400                 memset(&right_leaf_el->l_recs[0], 0,
2401                        sizeof(struct ocfs2_extent_rec));
2402         }
2403         if (eb->h_next_leaf_blk == 0ULL) {
2404                 /*
2405                  * Move recs over to get rid of empty extent, decrease
2406                  * next_free. This is allowed to remove the last
2407                  * extent in our leaf (setting l_next_free_rec to
2408                  * zero) - the delete code below won't care.
2409                  */
2410                 ocfs2_remove_empty_extent(right_leaf_el);
2411         }
2412
2413         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2414         if (ret)
2415                 mlog_errno(ret);
2416         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2417         if (ret)
2418                 mlog_errno(ret);
2419
2420         if (del_right_subtree) {
2421                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2422                                      subtree_index, dealloc);
2423                 ocfs2_update_edge_lengths(inode, handle, left_path);
2424
2425                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2426                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2427
2428                 /*
2429                  * Removal of the extent in the left leaf was skipped
2430                  * above so we could delete the right path
2431                  * 1st.
2432                  */
2433                 if (right_has_empty)
2434                         ocfs2_remove_empty_extent(left_leaf_el);
2435
2436                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2437                 if (ret)
2438                         mlog_errno(ret);
2439
2440                 *deleted = 1;
2441         } else
2442                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2443                                            subtree_index);
2444
2445 out:
2446         return ret;
2447 }
2448
2449 /*
2450  * Given a full path, determine what cpos value would return us a path
2451  * containing the leaf immediately to the right of the current one.
2452  *
2453  * Will return zero if the path passed in is already the rightmost path.
2454  *
2455  * This looks similar, but is subtly different to
2456  * ocfs2_find_cpos_for_left_leaf().
2457  */
2458 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2459                                           struct ocfs2_path *path, u32 *cpos)
2460 {
2461         int i, j, ret = 0;
2462         u64 blkno;
2463         struct ocfs2_extent_list *el;
2464
2465         *cpos = 0;
2466
2467         if (path->p_tree_depth == 0)
2468                 return 0;
2469
2470         blkno = path_leaf_bh(path)->b_blocknr;
2471
2472         /* Start at the tree node just above the leaf and work our way up. */
2473         i = path->p_tree_depth - 1;
2474         while (i >= 0) {
2475                 int next_free;
2476
2477                 el = path->p_node[i].el;
2478
2479                 /*
2480                  * Find the extent record just after the one in our
2481                  * path.
2482                  */
2483                 next_free = le16_to_cpu(el->l_next_free_rec);
2484                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2485                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2486                                 if (j == (next_free - 1)) {
2487                                         if (i == 0) {
2488                                                 /*
2489                                                  * We've determined that the
2490                                                  * path specified is already
2491                                                  * the rightmost one - return a
2492                                                  * cpos of zero.
2493                                                  */
2494                                                 goto out;
2495                                         }
2496                                         /*
2497                                          * The rightmost record points to our
2498                                          * leaf - we need to travel up the
2499                                          * tree one level.
2500                                          */
2501                                         goto next_node;
2502                                 }
2503
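                                /*
                                 * The record just to our right starts the
                                 * next subtree over - its cpos is the
                                 * leftmost cpos in the leaf to our right.
                                 */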
2504                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2505                                 goto out;
2506                         }
2507                 }
2508
2509                 /*
2510                  * If we got here, we never found a valid node where
2511                  * the tree indicated one should be.
2512                  */
2513                 ocfs2_error(sb,
2514                             "Invalid extent tree at extent block %llu\n",
2515                             (unsigned long long)blkno);
2516                 ret = -EROFS;
2517                 goto out;
2518
2519 next_node:
2520                 blkno = path->p_node[i].bh->b_blocknr;
2521                 i--;
2522         }
2523
2524 out:
2525         return ret;
2526 }
2527
2528 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2529                                             handle_t *handle,
2530                                             struct buffer_head *bh,
2531                                             struct ocfs2_extent_list *el)
2532 {
2533         int ret;
2534
2535         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2536                 return 0;
2537
2538         ret = ocfs2_journal_access(handle, inode, bh,
2539                                    OCFS2_JOURNAL_ACCESS_WRITE);
2540         if (ret) {
2541                 mlog_errno(ret);
2542                 goto out;
2543         }
2544
2545         ocfs2_remove_empty_extent(el);
2546
2547         ret = ocfs2_journal_dirty(handle, bh);
2548         if (ret)
2549                 mlog_errno(ret);
2550
2551 out:
2552         return ret;
2553 }
2554
2555 static int __ocfs2_rotate_tree_left(struct inode *inode,
2556                                     handle_t *handle, int orig_credits,
2557                                     struct ocfs2_path *path,
2558                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2559                                     struct ocfs2_path **empty_extent_path,
2560                                     struct ocfs2_extent_tree *et)
2561 {
2562         int ret, subtree_root, deleted;
2563         u32 right_cpos;
2564         struct ocfs2_path *left_path = NULL;
2565         struct ocfs2_path *right_path = NULL;
2566
2567         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2568
2569         *empty_extent_path = NULL;
2570
2571         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2572                                              &right_cpos);
2573         if (ret) {
2574                 mlog_errno(ret);
2575                 goto out;
2576         }
2577
2578         left_path = ocfs2_new_path(path_root_bh(path),
2579                                    path_root_el(path));
2580         if (!left_path) {
2581                 ret = -ENOMEM;
2582                 mlog_errno(ret);
2583                 goto out;
2584         }
2585
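        /*
         * Start with the caller's path as the left path; each iteration
         * rotates into it from the leaf to its right, then walks rightward.
         */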
2586         ocfs2_cp_path(left_path, path);
2587
2588         right_path = ocfs2_new_path(path_root_bh(path),
2589                                     path_root_el(path));
2590         if (!right_path) {
2591                 ret = -ENOMEM;
2592                 mlog_errno(ret);
2593                 goto out;
2594         }
2595
2596         while (right_cpos) {
2597                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2598                 if (ret) {
2599                         mlog_errno(ret);
2600                         goto out;
2601                 }
2602
2603                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2604                                                        right_path);
2605
2606                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2607                      subtree_root,
2608                      (unsigned long long)
2609                      right_path->p_node[subtree_root].bh->b_blocknr,
2610                      right_path->p_tree_depth);
2611
2612                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2613                                                       orig_credits, left_path);
2614                 if (ret) {
2615                         mlog_errno(ret);
2616                         goto out;
2617                 }
2618
2619                 /*
2620                  * Caller might still want to make changes to the
2621                  * tree root, so re-add it to the journal here.
2622                  */
2623                 ret = ocfs2_journal_access(handle, inode,
2624                                            path_root_bh(left_path),
2625                                            OCFS2_JOURNAL_ACCESS_WRITE);
2626                 if (ret) {
2627                         mlog_errno(ret);
2628                         goto out;
2629                 }
2630
2631                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2632                                                 right_path, subtree_root,
2633                                                 dealloc, &deleted, et);
2634                 if (ret == -EAGAIN) {
2635                         /*
2636                          * The rotation has to temporarily stop due to
2637                          * the right subtree having an empty
2638                          * extent. Pass it back to the caller for a
2639                          * fixup.
2640                          */
2641                         *empty_extent_path = right_path;
2642                         right_path = NULL;
2643                         goto out;
2644                 }
2645                 if (ret) {
2646                         mlog_errno(ret);
2647                         goto out;
2648                 }
2649
2650                 /*
2651                  * The subtree rotate might have removed records on
2652                  * the rightmost edge. If so, then rotation is
2653                  * complete.
2654                  */
2655                 if (deleted)
2656                         break;
2657
2658                 ocfs2_mv_path(left_path, right_path);
2659
2660                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2661                                                      &right_cpos);
2662                 if (ret) {
2663                         mlog_errno(ret);
2664                         goto out;
2665                 }
2666         }
2667
2668 out:
2669         ocfs2_free_path(right_path);
2670         ocfs2_free_path(left_path);
2671
2672         return ret;
2673 }
2674
2675 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2676                                 struct ocfs2_path *path,
2677                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2678                                 struct ocfs2_extent_tree *et)
2679 {
2680         int ret, subtree_index;
2681         u32 cpos;
2682         struct ocfs2_path *left_path = NULL;
2683         struct ocfs2_extent_block *eb;
2684         struct ocfs2_extent_list *el;
2685
2686
2687         ret = ocfs2_et_sanity_check(inode, et);
2688         if (ret)
2689                 goto out;
2690         /*
2691          * There are two ways we handle this, depending on
2692          * whether path is the only existing one.
2693          */
2694         ret = ocfs2_extend_rotate_transaction(handle, 0,
2695                                               handle->h_buffer_credits,
2696                                               path);
2697         if (ret) {
2698                 mlog_errno(ret);
2699                 goto out;
2700         }
2701
2702         ret = ocfs2_journal_access_path(inode, handle, path);
2703         if (ret) {
2704                 mlog_errno(ret);
2705                 goto out;
2706         }
2707
2708         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2709         if (ret) {
2710                 mlog_errno(ret);
2711                 goto out;
2712         }
2713
2714         if (cpos) {
2715                 /*
2716                  * We have a path to the left of this one - it needs
2717                  * an update too.
2718                  */
2719                 left_path = ocfs2_new_path(path_root_bh(path),
2720                                            path_root_el(path));
2721                 if (!left_path) {
2722                         ret = -ENOMEM;
2723                         mlog_errno(ret);
2724                         goto out;
2725                 }
2726
2727                 ret = ocfs2_find_path(inode, left_path, cpos);
2728                 if (ret) {
2729                         mlog_errno(ret);
2730                         goto out;
2731                 }
2732
2733                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2734                 if (ret) {
2735                         mlog_errno(ret);
2736                         goto out;
2737                 }
2738
2739                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2740
2741                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2742                                      subtree_index, dealloc);
2743                 ocfs2_update_edge_lengths(inode, handle, left_path);
2744
2745                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2746                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2747         } else {
2748                 /*
2749                  * 'path' is also the leftmost path which
2750                  * means it must be the only one. This gets
2751                  * handled differently because we want to
2752                  * revert the inode back to having extents
2753                  * in-line.
2754                  */
2755                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2756
2757                 el = et->et_root_el;
2758                 el->l_tree_depth = 0;
2759                 el->l_next_free_rec = 0;
2760                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2761
2762                 ocfs2_et_set_last_eb_blk(et, 0);
2763         }
2764
2765         ocfs2_journal_dirty(handle, path_root_bh(path));
2766
2767 out:
2768         ocfs2_free_path(left_path);
2769         return ret;
2770 }
2771
2772 /*
2773  * Left rotation of btree records.
2774  *
2775  * In many ways, this is (unsurprisingly) the opposite of right
2776  * rotation. We start at some non-rightmost path containing an empty
2777  * extent in the leaf block. The code works its way to the rightmost
2778  * path by rotating records to the left in every subtree.
2779  *
2780  * This is used by any code which reduces the number of extent records
2781  * in a leaf. After removal, an empty record should be placed in the
2782  * leftmost list position.
2783  *
2784  * This won't handle a length update of the rightmost path records if
2785  * the rightmost tree leaf record is removed so the caller is
2786  * responsible for detecting and correcting that.
2787  */
2788 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2789                                   struct ocfs2_path *path,
2790                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2791                                   struct ocfs2_extent_tree *et)
2792 {
2793         int ret, orig_credits = handle->h_buffer_credits;
2794         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2795         struct ocfs2_extent_block *eb;
2796         struct ocfs2_extent_list *el;
2797
2798         el = path_leaf_el(path);
2799         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2800                 return 0;
2801
2802         if (path->p_tree_depth == 0) {
2803 rightmost_no_delete:
2804                 /*
2805                  * Inline extents. This is trivially handled, so do
2806                  * it up front.
2807                  */
2808                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2809                                                        path_leaf_bh(path),
2810                                                        path_leaf_el(path));
2811                 if (ret)
2812                         mlog_errno(ret);
2813                 goto out;
2814         }
2815
2816         /*
2817          * Handle the rightmost branch now. There are several cases:
2818          *  1) simple rotation leaving records in there. That's trivial.
2819          *  2) rotation requiring a branch delete - there's no more
2820          *     records left. Two cases of this:
2821          *     a) There are branches to the left.
2822          *     b) This is also the leftmost (the only) branch.
2823          *
2824          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2825          *  2a) we need the left branch so that we can update it with the unlink
2826          *  2b) we need to bring the inode back to inline extents.
2827          */
2828
2829         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2830         el = &eb->h_list;
2831         if (eb->h_next_leaf_blk == 0) {
2832                 /*
2833                  * This gets a bit tricky if we're going to delete the
2834                  * rightmost path. Get the other cases out of the way
2835                  * 1st.
2836                  */
2837                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2838                         goto rightmost_no_delete;
2839
2840                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2841                         ret = -EIO;
2842                         ocfs2_error(inode->i_sb,
2843                                     "Inode %llu has empty extent block at %llu",
2844                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2845                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2846                         goto out;
2847                 }
2848
2849                 /*
2850                  * XXX: The caller cannot trust "path" any more after
2851                  * this as it will have been deleted. What do we do?
2852                  *
2853                  * In theory the rotate-for-merge code will never get
2854                  * here because it'll always ask for a rotate in a
2855                  * nonempty list.
2856                  */
2857
2858                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2859                                                   dealloc, et);
2860                 if (ret)
2861                         mlog_errno(ret);
2862                 goto out;
2863         }
2864
2865         /*
2866          * Now we can loop, remembering the path we get from -EAGAIN
2867          * and restarting from there.
2868          */
2869 try_rotate:
2870         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2871                                        dealloc, &restart_path, et);
2872         if (ret && ret != -EAGAIN) {
2873                 mlog_errno(ret);
2874                 goto out;
2875         }
2876
2877         while (ret == -EAGAIN) {
2878                 tmp_path = restart_path;
2879                 restart_path = NULL;
2880
2881                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2882                                                tmp_path, dealloc,
2883                                                &restart_path, et);
2884                 if (ret && ret != -EAGAIN) {
2885                         mlog_errno(ret);
2886                         goto out;
2887                 }
2888
2889                 ocfs2_free_path(tmp_path);
2890                 tmp_path = NULL;
2891
2892                 if (ret == 0)
2893                         goto try_rotate;
2894         }
2895
2896 out:
2897         ocfs2_free_path(tmp_path);
2898         ocfs2_free_path(restart_path);
2899         return ret;
2900 }
2901
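     /*
      * Roughly: once a merge has drained the record at 'index' down to zero
      * clusters, repack the leaf. For example (illustrative indices only),
      * (R0, R1, Rmerged=0, R3) with index == 2 becomes (EMPTY, R0, R1, R3),
      * keeping the one allowed empty extent in slot 0.
      */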
2902 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2903                                 int index)
2904 {
2905         struct ocfs2_extent_rec *rec = &el->l_recs[index];
2906         unsigned int size;
2907
2908         if (rec->e_leaf_clusters == 0) {
2909                 /*
2910                  * We consumed all of the merged-from record. An empty
2911                  * extent cannot exist anywhere but the 1st array
2912                  * position, so move things over if the merged-from
2913                  * record doesn't occupy that position.
2914                  *
2915                  * This creates a new empty extent so the caller
2916                  * should be smart enough to have removed any existing
2917                  * ones.
2918                  */
2919                 if (index > 0) {
2920                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2921                         size = index * sizeof(struct ocfs2_extent_rec);
2922                         memmove(&el->l_recs[1], &el->l_recs[0], size);
2923                 }
2924
2925                 /*
2926                  * Always memset - the caller doesn't check whether it
2927                  * created an empty extent, so there could be junk in
2928                  * the other fields.
2929                  */
2930                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2931         }
2932 }
2933
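     /*
      * Find the path to the leaf immediately to the right of left_path,
      * sharing the same root. This is only expected to be used by the
      * cross-extent-block merge below, so left_path's leaf is assumed to be
      * completely full.
      */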
2934 static int ocfs2_get_right_path(struct inode *inode,
2935                                 struct ocfs2_path *left_path,
2936                                 struct ocfs2_path **ret_right_path)
2937 {
2938         int ret;
2939         u32 right_cpos;
2940         struct ocfs2_path *right_path = NULL;
2941         struct ocfs2_extent_list *left_el;
2942
2943         *ret_right_path = NULL;
2944
2945         /* This function shouldn't be called for non-trees. */
2946         BUG_ON(left_path->p_tree_depth == 0);
2947
2948         left_el = path_leaf_el(left_path);
2949         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2950
2951         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2952                                              &right_cpos);
2953         if (ret) {
2954                 mlog_errno(ret);
2955                 goto out;
2956         }
2957
2958         /* This function shouldn't be called for the rightmost leaf. */
2959         BUG_ON(right_cpos == 0);
2960
2961         right_path = ocfs2_new_path(path_root_bh(left_path),
2962                                     path_root_el(left_path));
2963         if (!right_path) {
2964                 ret = -ENOMEM;
2965                 mlog_errno(ret);
2966                 goto out;
2967         }
2968
2969         ret = ocfs2_find_path(inode, right_path, right_cpos);
2970         if (ret) {
2971                 mlog_errno(ret);
2972                 goto out;
2973         }
2974
2975         *ret_right_path = right_path;
2976 out:
2977         if (ret)
2978                 ocfs2_free_path(right_path);
2979         return ret;
2980 }
2981
2982 /*
2983  * Remove split_rec clusters from the record at index and merge them
2984  * onto the beginning of the record "next" to it.
2985  * For index < l_count - 1, the next means the extent rec at index + 1.
2986  * For index == l_count - 1, the "next" means the 1st extent rec of the
2987  * next extent block.
2988  */
2989 static int ocfs2_merge_rec_right(struct inode *inode,
2990                                  struct ocfs2_path *left_path,
2991                                  handle_t *handle,
2992                                  struct ocfs2_extent_rec *split_rec,
2993                                  int index)
2994 {
2995         int ret, next_free, i;
2996         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2997         struct ocfs2_extent_rec *left_rec;
2998         struct ocfs2_extent_rec *right_rec;
2999         struct ocfs2_extent_list *right_el;
3000         struct ocfs2_path *right_path = NULL;
3001         int subtree_index = 0;
3002         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3003         struct buffer_head *bh = path_leaf_bh(left_path);
3004         struct buffer_head *root_bh = NULL;
3005
3006         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3007         left_rec = &el->l_recs[index];
3008
3009         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3010             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3011                 /* The merge crosses into the next extent block. */
3012                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3013                 if (ret) {
3014                         mlog_errno(ret);
3015                         goto out;
3016                 }
3017
3018                 right_el = path_leaf_el(right_path);
3019                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3020                 BUG_ON(next_free <= 0);
3021                 right_rec = &right_el->l_recs[0];
3022                 if (ocfs2_is_empty_extent(right_rec)) {
3023                         BUG_ON(next_free <= 1);
3024                         right_rec = &right_el->l_recs[1];
3025                 }
3026
3027                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3028                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3029                        le32_to_cpu(right_rec->e_cpos));
3030
3031                 subtree_index = ocfs2_find_subtree_root(inode,
3032                                                         left_path, right_path);
3033
3034                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3035                                                       handle->h_buffer_credits,
3036                                                       right_path);
3037                 if (ret) {
3038                         mlog_errno(ret);
3039                         goto out;
3040                 }
3041
3042                 root_bh = left_path->p_node[subtree_index].bh;
3043                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3044
3045                 ret = ocfs2_journal_access(handle, inode, root_bh,
3046                                            OCFS2_JOURNAL_ACCESS_WRITE);
3047                 if (ret) {
3048                         mlog_errno(ret);
3049                         goto out;
3050                 }
3051
3052                 for (i = subtree_index + 1;
3053                      i < path_num_items(right_path); i++) {
3054                         ret = ocfs2_journal_access(handle, inode,
3055                                                    right_path->p_node[i].bh,
3056                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3057                         if (ret) {
3058                                 mlog_errno(ret);
3059                                 goto out;
3060                         }
3061
3062                         ret = ocfs2_journal_access(handle, inode,
3063                                                    left_path->p_node[i].bh,
3064                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3065                         if (ret) {
3066                                 mlog_errno(ret);
3067                                 goto out;
3068                         }
3069                 }
3070
3071         } else {
3072                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3073                 right_rec = &el->l_recs[index + 1];
3074         }
3075
3076         ret = ocfs2_journal_access(handle, inode, bh,
3077                                    OCFS2_JOURNAL_ACCESS_WRITE);
3078         if (ret) {
3079                 mlog_errno(ret);
3080                 goto out;
3081         }
3082
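             /*
              * Hand split_clusters from the tail of left_rec to the head of
              * right_rec. For example (illustrative numbers only): left_rec
              * (cpos 0, 10 clusters) and right_rec (cpos 10, 5 clusters)
              * with split_clusters == 4 become (0, 6) and (6, 9);
              * right_rec's e_blkno moves back by the same amount in blocks.
              */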
3083         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3084
3085         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3086         le64_add_cpu(&right_rec->e_blkno,
3087                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3088         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3089
3090         ocfs2_cleanup_merge(el, index);
3091
3092         ret = ocfs2_journal_dirty(handle, bh);
3093         if (ret)
3094                 mlog_errno(ret);
3095
3096         if (right_path) {
3097                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3098                 if (ret)
3099                         mlog_errno(ret);
3100
3101                 ocfs2_complete_edge_insert(inode, handle, left_path,
3102                                            right_path, subtree_index);
3103         }
3104 out:
3105         if (right_path)
3106                 ocfs2_free_path(right_path);
3107         return ret;
3108 }
3109
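     /*
      * Counterpart of ocfs2_get_right_path(): find the path to the leaf
      * immediately to the left of right_path, sharing the same root.
      */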
3110 static int ocfs2_get_left_path(struct inode *inode,
3111                                struct ocfs2_path *right_path,
3112                                struct ocfs2_path **ret_left_path)
3113 {
3114         int ret;
3115         u32 left_cpos;
3116         struct ocfs2_path *left_path = NULL;
3117
3118         *ret_left_path = NULL;
3119
3120         /* This function shouldn't be called for non-trees. */
3121         BUG_ON(right_path->p_tree_depth == 0);
3122
3123         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3124                                             right_path, &left_cpos);
3125         if (ret) {
3126                 mlog_errno(ret);
3127                 goto out;
3128         }
3129
3130         /* This function shouldn't be called for the leftmost leaf. */
3131         BUG_ON(left_cpos == 0);
3132
3133         left_path = ocfs2_new_path(path_root_bh(right_path),
3134                                    path_root_el(right_path));
3135         if (!left_path) {
3136                 ret = -ENOMEM;
3137                 mlog_errno(ret);
3138                 goto out;
3139         }
3140
3141         ret = ocfs2_find_path(inode, left_path, left_cpos);
3142         if (ret) {
3143                 mlog_errno(ret);
3144                 goto out;
3145         }
3146
3147         *ret_left_path = left_path;
3148 out:
3149         if (ret)
3150                 ocfs2_free_path(left_path);
3151         return ret;
3152 }
3153
3154 /*
3155  * Remove split_rec clusters from the record at index and merge them
3156  * onto the tail of the record "before" it.
3157  * For index > 0, the "before" means the extent rec at index - 1.
3158  *
3159  * For index == 0, the "before" means the last record of the previous
3160  * extent block. There is also a case where we may need to remove
3161  * the rightmost leaf extent block in the right_path and update
3162  * right_path to point at the new rightmost path.
3163  */
3164 static int ocfs2_merge_rec_left(struct inode *inode,
3165                                 struct ocfs2_path *right_path,
3166                                 handle_t *handle,
3167                                 struct ocfs2_extent_rec *split_rec,
3168                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3169                                 struct ocfs2_extent_tree *et,
3170                                 int index)
3171 {
3172         int ret, i, subtree_index = 0, has_empty_extent = 0;
3173         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3174         struct ocfs2_extent_rec *left_rec;
3175         struct ocfs2_extent_rec *right_rec;
3176         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3177         struct buffer_head *bh = path_leaf_bh(right_path);
3178         struct buffer_head *root_bh = NULL;
3179         struct ocfs2_path *left_path = NULL;
3180         struct ocfs2_extent_list *left_el;
3181
3182         BUG_ON(index < 0);
3183
3184         right_rec = &el->l_recs[index];
3185         if (index == 0) {
3186                 /* The merge crosses into the previous extent block. */
3187                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3188                 if (ret) {
3189                         mlog_errno(ret);
3190                         goto out;
3191                 }
3192
3193                 left_el = path_leaf_el(left_path);
3194                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3195                        le16_to_cpu(left_el->l_count));
3196
3197                 left_rec = &left_el->l_recs[
3198                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3199                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3200                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3201                        le32_to_cpu(split_rec->e_cpos));
3202
3203                 subtree_index = ocfs2_find_subtree_root(inode,
3204                                                         left_path, right_path);
3205
3206                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3207                                                       handle->h_buffer_credits,
3208                                                       left_path);
3209                 if (ret) {
3210                         mlog_errno(ret);
3211                         goto out;
3212                 }
3213
3214                 root_bh = left_path->p_node[subtree_index].bh;
3215                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3216
3217                 ret = ocfs2_journal_access(handle, inode, root_bh,
3218                                            OCFS2_JOURNAL_ACCESS_WRITE);
3219                 if (ret) {
3220                         mlog_errno(ret);
3221                         goto out;
3222                 }
3223
3224                 for (i = subtree_index + 1;
3225                      i < path_num_items(right_path); i++) {
3226                         ret = ocfs2_journal_access(handle, inode,
3227                                                    right_path->p_node[i].bh,
3228                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3229                         if (ret) {
3230                                 mlog_errno(ret);
3231                                 goto out;
3232                         }
3233
3234                         ret = ocfs2_journal_access(handle, inode,
3235                                                    left_path->p_node[i].bh,
3236                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3237                         if (ret) {
3238                                 mlog_errno(ret);
3239                                 goto out;
3240                         }
3241                 }
3242         } else {
3243                 left_rec = &el->l_recs[index - 1];
3244                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3245                         has_empty_extent = 1;
3246         }
3247
3248         ret = ocfs2_journal_access(handle, inode, bh,
3249                                    OCFS2_JOURNAL_ACCESS_WRITE);
3250         if (ret) {
3251                 mlog_errno(ret);
3252                 goto out;
3253         }
3254
3255         if (has_empty_extent && index == 1) {
3256                 /*
3257                  * The easy case - we can just plop the record right in.
3258                  */
3259                 *left_rec = *split_rec;
3260
3261                 has_empty_extent = 0;
3262         } else
3263                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3264
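             /*
              * The mirror image of the merge-right arithmetic: right_rec
              * gives up split_clusters from its head, so its cpos and blkno
              * move forward and its length shrinks by the same amount.
              */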
3265         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3266         le64_add_cpu(&right_rec->e_blkno,
3267                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3268         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3269
3270         ocfs2_cleanup_merge(el, index);
3271
3272         ret = ocfs2_journal_dirty(handle, bh);
3273         if (ret)
3274                 mlog_errno(ret);
3275
3276         if (left_path) {
3277                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3278                 if (ret)
3279                         mlog_errno(ret);
3280
3281                 /*
3282                  * If right_rec is empty and the extent block is empty
3283                  * as well, ocfs2_complete_edge_insert() can't handle it
3284                  * and we need to delete the right extent block.
3285                  */
3286                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3287                     le16_to_cpu(el->l_next_free_rec) == 1) {
3288
3289                         ret = ocfs2_remove_rightmost_path(inode, handle,
3290                                                           right_path,
3291                                                           dealloc, et);
3292                         if (ret) {
3293                                 mlog_errno(ret);
3294                                 goto out;
3295                         }
3296
3297                         /* Now the rightmost extent block has been deleted.
3298                          * So we use the new rightmost path.
3299                          */
3300                         ocfs2_mv_path(right_path, left_path);
3301                         left_path = NULL;
3302                 } else
3303                         ocfs2_complete_edge_insert(inode, handle, left_path,
3304                                                    right_path, subtree_index);
3305         }
3306 out:
3307         if (left_path)
3308                 ocfs2_free_path(left_path);
3309         return ret;
3310 }
3311
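     /*
      * A rough sketch of the strategy (the comments in the body have the
      * details): if the new record is contiguous on both sides, the existing
      * record is consumed entirely, so merge right, rotate the resulting
      * empty extent away, then merge left. Otherwise a single merge in the
      * appropriate direction is enough, with an optional final left rotation
      * to clear any empty extent left behind.
      */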
3312 static int ocfs2_try_to_merge_extent(struct inode *inode,
3313                                      handle_t *handle,
3314                                      struct ocfs2_path *path,
3315                                      int split_index,
3316                                      struct ocfs2_extent_rec *split_rec,
3317                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3318                                      struct ocfs2_merge_ctxt *ctxt,
3319                                      struct ocfs2_extent_tree *et)
3320
3321 {
3322         int ret = 0;
3323         struct ocfs2_extent_list *el = path_leaf_el(path);
3324         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3325
3326         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3327
3328         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3329                 /*
3330                  * The merge code will need to create an empty
3331                  * extent to take the place of the newly
3332                  * emptied slot. Remove any pre-existing empty
3333                  * extents - having more than one in a leaf is
3334                  * illegal.
3335                  */
3336                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3337                                              dealloc, et);
3338                 if (ret) {
3339                         mlog_errno(ret);
3340                         goto out;
3341                 }
3342                 split_index--;
3343                 rec = &el->l_recs[split_index];
3344         }
3345
3346         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3347                 /*
3348                  * Left-right contig implies this.
3349                  */
3350                 BUG_ON(!ctxt->c_split_covers_rec);
3351
3352                 /*
3353                  * Since the leftright insert always covers the entire
3354                  * extent, this call will delete the insert record
3355                  * entirely, resulting in an empty extent record added to
3356                  * the extent block.
3357                  *
3358                  * Since the adding of an empty extent shifts
3359                  * everything back to the right, there's no need to
3360                  * update split_index here.
3361                  *
3362                  * When split_index is zero, we need to merge the record
3363                  * into the previous extent block. It is easier and more
3364                  * efficient to do merge_right first and merge_left later.
3365                  */
3366                 ret = ocfs2_merge_rec_right(inode, path,
3367                                             handle, split_rec,
3368                                             split_index);
3369                 if (ret) {
3370                         mlog_errno(ret);
3371                         goto out;
3372                 }
3373
3374                 /*
3375                  * We can only get this from logic error above.
3376                  */
3377                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3378
3379                 /* The merge left us with an empty extent, remove it. */
3380                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3381                                              dealloc, et);
3382                 if (ret) {
3383                         mlog_errno(ret);
3384                         goto out;
3385                 }
3386
3387                 rec = &el->l_recs[split_index];
3388
3389                 /*
3390                  * Note that we don't pass split_rec here on purpose -
3391                  * we've merged it into the rec already.
3392                  */
3393                 ret = ocfs2_merge_rec_left(inode, path,
3394                                            handle, rec,
3395                                            dealloc, et,
3396                                            split_index);
3397
3398                 if (ret) {
3399                         mlog_errno(ret);
3400                         goto out;
3401                 }
3402
3403                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3404                                              dealloc, et);
3405                 /*
3406                  * Error from this last rotate is not critical, so
3407                  * print but don't bubble it up.
3408                  */
3409                 if (ret)
3410                         mlog_errno(ret);
3411                 ret = 0;
3412         } else {
3413                 /*
3414                  * Merge a record to the left or right.
3415                  *
3416                  * 'contig_type' is relative to the existing record,
3417                  * so for example, if we're "right contig", it's to
3418                  * the record on the left (hence the left merge).
3419                  */
3420                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3421                         ret = ocfs2_merge_rec_left(inode,
3422                                                    path,
3423                                                    handle, split_rec,
3424                                                    dealloc, et,
3425                                                    split_index);
3426                         if (ret) {
3427                                 mlog_errno(ret);
3428                                 goto out;
3429                         }
3430                 } else {
3431                         ret = ocfs2_merge_rec_right(inode,
3432                                                     path,
3433                                                     handle, split_rec,
3434                                                     split_index);
3435                         if (ret) {
3436                                 mlog_errno(ret);
3437                                 goto out;
3438                         }
3439                 }
3440
3441                 if (ctxt->c_split_covers_rec) {
3442                         /*
3443                          * The merge may have left an empty extent in
3444                          * our leaf. Try to rotate it away.
3445                          */
3446                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3447                                                      dealloc, et);
3448                         if (ret)
3449                                 mlog_errno(ret);
3450                         ret = 0;
3451                 }
3452         }
3453
3454 out:
3455         return ret;
3456 }
3457
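     /*
      * Trim split_rec's clusters off one edge of rec (an illustrative
      * summary of the code below): for a SPLIT_LEFT split the region sits
      * at the front of rec, so rec's cpos and blkno advance and its length
      * shrinks; for SPLIT_RIGHT only the length shrinks.
      */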
3458 static void ocfs2_subtract_from_rec(struct super_block *sb,
3459                                     enum ocfs2_split_type split,
3460                                     struct ocfs2_extent_rec *rec,
3461                                     struct ocfs2_extent_rec *split_rec)
3462 {
3463         u64 len_blocks;
3464
3465         len_blocks = ocfs2_clusters_to_blocks(sb,
3466                                 le16_to_cpu(split_rec->e_leaf_clusters));
3467
3468         if (split == SPLIT_LEFT) {
3469                 /*
3470                  * Region is on the left edge of the existing
3471                  * record.
3472                  */
3473                 le32_add_cpu(&rec->e_cpos,
3474                              le16_to_cpu(split_rec->e_leaf_clusters));
3475                 le64_add_cpu(&rec->e_blkno, len_blocks);
3476                 le16_add_cpu(&rec->e_leaf_clusters,
3477                              -le16_to_cpu(split_rec->e_leaf_clusters));
3478         } else {
3479                 /*
3480                  * Region is on the right edge of the existing
3481                  * record.
3482                  */
3483                 le16_add_cpu(&rec->e_leaf_clusters,
3484                              -le16_to_cpu(split_rec->e_leaf_clusters));
3485         }
3486 }
3487
3488 /*
3489  * Do the final bits of extent record insertion at the target leaf
3490  * list. If this leaf is part of an allocation tree, it is assumed
3491  * that the tree above has been prepared.
3492  */
3493 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3494                                  struct ocfs2_extent_list *el,
3495                                  struct ocfs2_insert_type *insert,
3496                                  struct inode *inode)
3497 {
3498         int i = insert->ins_contig_index;
3499         unsigned int range;
3500         struct ocfs2_extent_rec *rec;
3501
3502         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3503
3504         if (insert->ins_split != SPLIT_NONE) {
3505                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3506                 BUG_ON(i == -1);
3507                 rec = &el->l_recs[i];
3508                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3509                                         insert_rec);
3510                 goto rotate;
3511         }
3512
3513         /*
3514          * Contiguous insert - either left or right.
3515          */
3516         if (insert->ins_contig != CONTIG_NONE) {
3517                 rec = &el->l_recs[i];
3518                 if (insert->ins_contig == CONTIG_LEFT) {
3519                         rec->e_blkno = insert_rec->e_blkno;
3520                         rec->e_cpos = insert_rec->e_cpos;
3521                 }
3522                 le16_add_cpu(&rec->e_leaf_clusters,
3523                              le16_to_cpu(insert_rec->e_leaf_clusters));
3524                 return;
3525         }
3526
3527         /*
3528          * Handle insert into an empty leaf.
3529          */
3530         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3531             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3532              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3533                 el->l_recs[0] = *insert_rec;
3534                 el->l_next_free_rec = cpu_to_le16(1);
3535                 return;
3536         }
3537
3538         /*
3539          * Appending insert.
3540          */
3541         if (insert->ins_appending == APPEND_TAIL) {
3542                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3543                 rec = &el->l_recs[i];
3544                 range = le32_to_cpu(rec->e_cpos)
3545                         + le16_to_cpu(rec->e_leaf_clusters);
3546                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3547
3548                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3549                                 le16_to_cpu(el->l_count),
3550                                 "inode %lu, depth %u, count %u, next free %u, "
3551                                 "rec.cpos %u, rec.clusters %u, "
3552                                 "insert.cpos %u, insert.clusters %u\n",
3553                                 inode->i_ino,
3554                                 le16_to_cpu(el->l_tree_depth),
3555                                 le16_to_cpu(el->l_count),
3556                                 le16_to_cpu(el->l_next_free_rec),
3557                                 le32_to_cpu(el->l_recs[i].e_cpos),
3558                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3559                                 le32_to_cpu(insert_rec->e_cpos),
3560                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3561                 i++;
3562                 el->l_recs[i] = *insert_rec;
3563                 le16_add_cpu(&el->l_next_free_rec, 1);
3564                 return;
3565         }
3566
3567 rotate:
3568         /*
3569          * Ok, we have to rotate.
3570          *
3571          * At this point, it is safe to assume that inserting into an
3572          * empty leaf and appending to a leaf have both been handled
3573          * above.
3574          *
3575          * This leaf needs to have space, either by the empty 1st
3576          * extent record, or by virtue of an l_next_free_rec < l_count.
3577          */
3578         ocfs2_rotate_leaf(el, insert_rec);
3579 }
3580
3581 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3582                                            handle_t *handle,
3583                                            struct ocfs2_path *path,
3584                                            struct ocfs2_extent_rec *insert_rec)
3585 {
3586         int ret, i, next_free;
3587         struct buffer_head *bh;
3588         struct ocfs2_extent_list *el;
3589         struct ocfs2_extent_rec *rec;
3590
3591         /*
3592          * Update everything except the leaf block.
3593          */
3594         for (i = 0; i < path->p_tree_depth; i++) {
3595                 bh = path->p_node[i].bh;
3596                 el = path->p_node[i].el;
3597
3598                 next_free = le16_to_cpu(el->l_next_free_rec);
3599                 if (next_free == 0) {
3600                         ocfs2_error(inode->i_sb,
3601                                     "Dinode %llu has a bad extent list",
3602                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3603                         ret = -EIO;
3604                         return;
3605                 }
3606
3607                 rec = &el->l_recs[next_free - 1];
3608
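                     /*
                      * Stretch the interior record so that it ends where the
                      * appended record ends: the new length is
                      * (insert end cpos) - (rec->e_cpos), computed here in
                      * three little-endian-safe steps.
                      */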
3609                 rec->e_int_clusters = insert_rec->e_cpos;
3610                 le32_add_cpu(&rec->e_int_clusters,
3611                              le16_to_cpu(insert_rec->e_leaf_clusters));
3612                 le32_add_cpu(&rec->e_int_clusters,
3613                              -le32_to_cpu(rec->e_cpos));
3614
3615                 ret = ocfs2_journal_dirty(handle, bh);
3616                 if (ret)
3617                         mlog_errno(ret);
3618
3619         }
3620 }
3621
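     /*
      * Roughly: prepare the rightmost path for an appending insert by
      * journaling it and stretching its interior records to cover
      * insert_rec. If the append lands in the (currently empty) leftmost
      * slot of a non-leftmost leaf, the neighbouring path to the left is
      * looked up and handed back so the caller can fix its edge up after
      * the insert.
      */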
3622 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3623                                     struct ocfs2_extent_rec *insert_rec,
3624                                     struct ocfs2_path *right_path,
3625                                     struct ocfs2_path **ret_left_path)
3626 {
3627         int ret, next_free;
3628         struct ocfs2_extent_list *el;
3629         struct ocfs2_path *left_path = NULL;
3630
3631         *ret_left_path = NULL;
3632
3633         /*
3634          * This shouldn't happen for non-trees. The extent rec cluster
3635          * count manipulation below only works for interior nodes.
3636          */
3637         BUG_ON(right_path->p_tree_depth == 0);
3638
3639         /*
3640          * If our appending insert is at the leftmost edge of a leaf,
3641          * then we might need to update the rightmost records of the
3642          * neighboring path.
3643          */
3644         el = path_leaf_el(right_path);
3645         next_free = le16_to_cpu(el->l_next_free_rec);
3646         if (next_free == 0 ||
3647             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3648                 u32 left_cpos;
3649
3650                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3651                                                     &left_cpos);
3652                 if (ret) {
3653                         mlog_errno(ret);
3654                         goto out;
3655                 }
3656
3657                 mlog(0, "Append may need a left path update. cpos: %u, "
3658                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3659                      left_cpos);
3660
3661                 /*
3662                  * No need to worry if the append is already in the
3663                  * leftmost leaf.
3664                  */
3665                 if (left_cpos) {
3666                         left_path = ocfs2_new_path(path_root_bh(right_path),
3667                                                    path_root_el(right_path));
3668                         if (!left_path) {
3669                                 ret = -ENOMEM;
3670                                 mlog_errno(ret);
3671                                 goto out;
3672                         }
3673
3674                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3675                         if (ret) {
3676                                 mlog_errno(ret);
3677                                 goto out;
3678                         }
3679
3680                         /*
3681                          * ocfs2_insert_path() will pass the left_path to the
3682                          * journal for us.
3683                          */
3684                 }
3685         }
3686
3687         ret = ocfs2_journal_access_path(inode, handle, right_path);
3688         if (ret) {
3689                 mlog_errno(ret);
3690                 goto out;
3691         }
3692
3693         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3694
3695         *ret_left_path = left_path;
3696         ret = 0;
3697 out:
3698         if (ret != 0)
3699                 ocfs2_free_path(left_path);
3700
3701         return ret;
3702 }
3703
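     /*
      * Insert split_rec into the leaf, splitting an existing record. The
      * record being split usually lives in the right leaf; when a prior
      * rotation has pushed things around, the target record or the insert
      * itself may end up in the left leaf instead (see the comments in the
      * body for the individual cases).
      */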
3704 static void ocfs2_split_record(struct inode *inode,
3705                                struct ocfs2_path *left_path,
3706                                struct ocfs2_path *right_path,
3707                                struct ocfs2_extent_rec *split_rec,
3708                                enum ocfs2_split_type split)
3709 {
3710         int index;
3711         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3712         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3713         struct ocfs2_extent_rec *rec, *tmprec;
3714
3715         right_el = path_leaf_el(right_path);
3716         if (left_path)
3717                 left_el = path_leaf_el(left_path);
3718
3719         el = right_el;
3720         insert_el = right_el;
3721         index = ocfs2_search_extent_list(el, cpos);
3722         if (index != -1) {
3723                 if (index == 0 && left_path) {
3724                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3725
3726                         /*
3727                          * This typically means that the record
3728                          * started in the left path but moved to the
3729                          * right as a result of rotation. We either
3730                          * move the existing record to the left, or we
3731                          * do the later insert there.
3732                          *
3733                          * In this case, the left path should always
3734                          * exist as the rotate code will have passed
3735                          * it back for a post-insert update.
3736                          */
3737
3738                         if (split == SPLIT_LEFT) {
3739                                 /*
3740                                  * It's a left split. Since we know
3741                                  * that the rotate code gave us an
3742                                  * empty extent in the left path, we
3743                                  * can just do the insert there.
3744                                  */
3745                                 insert_el = left_el;
3746                         } else {
3747                                 /*
3748                                  * Right split - we have to move the
3749                                  * existing record over to the left
3750                                  * leaf. The insert will be into the
3751                                  * newly created empty extent in the
3752                                  * right leaf.
3753                                  */
3754                                 tmprec = &right_el->l_recs[index];
3755                                 ocfs2_rotate_leaf(left_el, tmprec);
3756                                 el = left_el;
3757
3758                                 memset(tmprec, 0, sizeof(*tmprec));
3759                                 index = ocfs2_search_extent_list(left_el, cpos);
3760                                 BUG_ON(index == -1);
3761                         }
3762                 }
3763         } else {
3764                 BUG_ON(!left_path);
3765                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3766                 /*
3767                  * Left path is easy - we can just allow the insert to
3768                  * happen.
3769                  */
3770                 el = left_el;
3771                 insert_el = left_el;
3772                 index = ocfs2_search_extent_list(el, cpos);
3773                 BUG_ON(index == -1);
3774         }
3775
3776         rec = &el->l_recs[index];
3777         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3778         ocfs2_rotate_leaf(insert_el, split_rec);
3779 }
3780
3781 /*
3782  * This function only does inserts on an allocation b-tree. For tree
3783  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3784  *
3785  * right_path is the path we want to do the actual insert
3786  * in. left_path should only be passed in if we need to update that
3787  * portion of the tree after an edge insert.
3788  */
3789 static int ocfs2_insert_path(struct inode *inode,
3790                              handle_t *handle,
3791                              struct ocfs2_path *left_path,
3792                              struct ocfs2_path *right_path,
3793                              struct ocfs2_extent_rec *insert_rec,
3794                              struct ocfs2_insert_type *insert)
3795 {
3796         int ret, subtree_index;
3797         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3798
3799         if (left_path) {
3800                 int credits = handle->h_buffer_credits;
3801
3802                 /*
3803                  * There's a chance that left_path got passed back to
3804                  * us without being accounted for in the
3805                  * journal. Extend our transaction here to be sure we
3806                  * can change those blocks.
3807                  */
3808                 credits += left_path->p_tree_depth;
3809
3810                 ret = ocfs2_extend_trans(handle, credits);
3811                 if (ret < 0) {
3812                         mlog_errno(ret);
3813                         goto out;
3814                 }
3815
3816                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3817                 if (ret < 0) {
3818                         mlog_errno(ret);
3819                         goto out;
3820                 }
3821         }
3822
3823         /*
3824          * Pass both paths to the journal. The majority of inserts
3825          * will be touching all components anyway.
3826          */
3827         ret = ocfs2_journal_access_path(inode, handle, right_path);
3828         if (ret < 0) {
3829                 mlog_errno(ret);
3830                 goto out;
3831         }
3832
3833         if (insert->ins_split != SPLIT_NONE) {
3834                 /*
3835                  * We could call ocfs2_insert_at_leaf() for some types
3836                  * of splits, but it's easier to just let one separate
3837                  * function sort it all out.
3838                  */
3839                 ocfs2_split_record(inode, left_path, right_path,
3840                                    insert_rec, insert->ins_split);
3841
3842                 /*
3843                  * Split might have modified either leaf and we don't
3844                  * have a guarantee that the later edge insert will
3845                  * dirty this for us.
3846                  */
3847                 if (left_path) {
3848                         ret = ocfs2_journal_dirty(handle,
3849                                                   path_leaf_bh(left_path));
3850                         if (ret)
3851                                 mlog_errno(ret);
                     }
3852         } else
3853                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3854                                      insert, inode);
3855
3856         ret = ocfs2_journal_dirty(handle, leaf_bh);
3857         if (ret)
3858                 mlog_errno(ret);
3859
3860         if (left_path) {
3861                 /*
3862                  * The rotate code has indicated that we need to fix
3863                  * up portions of the tree after the insert.
3864                  *
3865                  * XXX: Should we extend the transaction here?
3866                  */
3867                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3868                                                         right_path);
3869                 ocfs2_complete_edge_insert(inode, handle, left_path,
3870                                            right_path, subtree_index);
3871         }
3872
3873         ret = 0;
3874 out:
3875         return ret;
3876 }
3877
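     /*
      * Top-level insert: journal the tree root, handle the depth-zero case
      * inline, otherwise find the target path (the rightmost one when a
      * rotation is needed), do any rotate/append preparation and then the
      * actual leaf insert. A hedged sketch of a typical call sequence from
      * an insert helper might look like:
      *
      *	ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
      *				 &free_records, &insert);
      *	ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
      *
      * (sketch only - real callers also handle growing the tree when the
      * leaves are full).
      */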
3878 static int ocfs2_do_insert_extent(struct inode *inode,
3879                                   handle_t *handle,
3880                                   struct ocfs2_extent_tree *et,
3881                                   struct ocfs2_extent_rec *insert_rec,
3882                                   struct ocfs2_insert_type *type)
3883 {
3884         int ret, rotate = 0;
3885         u32 cpos;
3886         struct ocfs2_path *right_path = NULL;
3887         struct ocfs2_path *left_path = NULL;
3888         struct ocfs2_extent_list *el;
3889
3890         el = et->et_root_el;
3891
3892         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3893                                    OCFS2_JOURNAL_ACCESS_WRITE);
3894         if (ret) {
3895                 mlog_errno(ret);
3896                 goto out;
3897         }
3898
3899         if (le16_to_cpu(el->l_tree_depth) == 0) {
3900                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3901                 goto out_update_clusters;
3902         }
3903
3904         right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
3905         if (!right_path) {
3906                 ret = -ENOMEM;
3907                 mlog_errno(ret);
3908                 goto out;
3909         }
3910
3911         /*
3912          * Determine the path to start with. Rotations need the
3913          * rightmost path, everything else can go directly to the
3914          * target leaf.
3915          */
3916         cpos = le32_to_cpu(insert_rec->e_cpos);
3917         if (type->ins_appending == APPEND_NONE &&
3918             type->ins_contig == CONTIG_NONE) {
3919                 rotate = 1;
3920                 cpos = UINT_MAX;
3921         }
3922
3923         ret = ocfs2_find_path(inode, right_path, cpos);
3924         if (ret) {
3925                 mlog_errno(ret);
3926                 goto out;
3927         }
3928
3929         /*
3930          * Rotations and appends need special treatment - they modify
3931          * parts of the tree above them.
3932          *
3933          * Both might pass back a path immediately to the left of the
3934          * one being inserted to. This will cause
3935          * ocfs2_insert_path() to modify the rightmost records of
3936          * left_path to account for an edge insert.
3937          *
3938          * XXX: When modifying this code, keep in mind that an insert
3939          * can wind up skipping both of these two special cases...
3940          */
3941         if (rotate) {
3942                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
3943                                               le32_to_cpu(insert_rec->e_cpos),
3944                                               right_path, &left_path);
3945                 if (ret) {
3946                         mlog_errno(ret);
3947                         goto out;
3948                 }
3949
3950                 /*
3951                  * ocfs2_rotate_tree_right() might have extended the
3952                  * transaction without re-journaling our tree root.
3953                  */
3954                 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3955                                            OCFS2_JOURNAL_ACCESS_WRITE);
3956                 if (ret) {
3957                         mlog_errno(ret);
3958                         goto out;
3959                 }
3960         } else if (type->ins_appending == APPEND_TAIL
3961                    && type->ins_contig != CONTIG_LEFT) {
3962                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
3963                                                right_path, &left_path);
3964                 if (ret) {
3965                         mlog_errno(ret);
3966                         goto out;
3967                 }
3968         }
3969
3970         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
3971                                 insert_rec, type);
3972         if (ret) {
3973                 mlog_errno(ret);
3974                 goto out;
3975         }
3976
3977 out_update_clusters:
3978         if (type->ins_split == SPLIT_NONE)
3979                 ocfs2_et_update_clusters(inode, et,
3980                                          le16_to_cpu(insert_rec->e_leaf_clusters));
3981
3982         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
3983         if (ret)
3984                 mlog_errno(ret);
3985
3986 out:
3987         ocfs2_free_path(left_path);
3988         ocfs2_free_path(right_path);
3989
3990         return ret;
3991 }
3992
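     /*
      * Decide how split_rec relates to its neighbours in the leaf: look at
      * the record before 'index' and the one after it (walking into the
      * adjacent leaf when the neighbour lives in another extent block) and
      * report CONTIG_LEFT, CONTIG_RIGHT, CONTIG_LEFTRIGHT or CONTIG_NONE.
      */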
3993 static enum ocfs2_contig_type
3994 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
3995                                struct ocfs2_extent_list *el, int index,
3996                                struct ocfs2_extent_rec *split_rec)
3997 {
3998         int status;
3999         enum ocfs2_contig_type ret = CONTIG_NONE;
4000         u32 left_cpos, right_cpos;
4001         struct ocfs2_extent_rec *rec = NULL;
4002         struct ocfs2_extent_list *new_el;
4003         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4004         struct buffer_head *bh;
4005         struct ocfs2_extent_block *eb;
4006
4007         if (index > 0) {
4008                 rec = &el->l_recs[index - 1];
4009         } else if (path->p_tree_depth > 0) {
4010                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4011                                                        path, &left_cpos);
4012                 if (status)
4013                         goto out;
4014
4015                 if (left_cpos != 0) {
4016                         left_path = ocfs2_new_path(path_root_bh(path),
4017                                                    path_root_el(path));
4018                         if (!left_path)
4019                                 goto out;
4020
4021                         status = ocfs2_find_path(inode, left_path, left_cpos);
4022                         if (status)
4023                                 goto out;
4024
4025                         new_el = path_leaf_el(left_path);
4026
4027                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4028                             le16_to_cpu(new_el->l_count)) {
4029                                 bh = path_leaf_bh(left_path);
4030                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4031                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4032                                                                  eb);
4033                                 goto out;
4034                         }
4035                         rec = &new_el->l_recs[
4036                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4037                 }
4038         }
4039
4040         /*
4041          * We're careful to check for an empty extent record here -
4042          * the merge code will know what to do if it sees one.
4043          */
4044         if (rec) {
4045                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4046                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4047                                 ret = CONTIG_RIGHT;
4048                 } else {
4049                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4050                 }
4051         }
4052
4053         rec = NULL;
4054         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4055                 rec = &el->l_recs[index + 1];
4056         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4057                  path->p_tree_depth > 0) {
4058                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4059                                                         path, &right_cpos);
4060                 if (status)
4061                         goto out;
4062
4063                 if (right_cpos == 0)
4064                         goto out;
4065
4066                 right_path = ocfs2_new_path(path_root_bh(path),
4067                                             path_root_el(path));
4068                 if (!right_path)
4069                         goto out;
4070
4071                 status = ocfs2_find_path(inode, right_path, right_cpos);
4072                 if (status)
4073                         goto out;
4074
4075                 new_el = path_leaf_el(right_path);
4076                 rec = &new_el->l_recs[0];
4077                 if (ocfs2_is_empty_extent(rec)) {
4078                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4079                                 bh = path_leaf_bh(right_path);
4080                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4081                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4082                                                                  eb);
4083                                 goto out;
4084                         }
4085                         rec = &new_el->l_recs[1];
4086                 }
4087         }
4088
4089         if (rec) {
4090                 enum ocfs2_contig_type contig_type;
4091
4092                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4093
4094                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4095                         ret = CONTIG_LEFTRIGHT;
4096                 else if (ret == CONTIG_NONE)
4097                         ret = contig_type;
4098         }
4099
4100 out:
4101         if (left_path)
4102                 ocfs2_free_path(left_path);
4103         if (right_path)
4104                 ocfs2_free_path(right_path);
4105
4106         return ret;
4107 }
4108
4109 static void ocfs2_figure_contig_type(struct inode *inode,
4110                                      struct ocfs2_insert_type *insert,
4111                                      struct ocfs2_extent_list *el,
4112                                      struct ocfs2_extent_rec *insert_rec,
4113                                      struct ocfs2_extent_tree *et)
4114 {
4115         int i;
4116         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4117
4118         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4119
4120         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4121                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4122                                                   insert_rec);
4123                 if (contig_type != CONTIG_NONE) {
4124                         insert->ins_contig_index = i;
4125                         break;
4126                 }
4127         }
4128         insert->ins_contig = contig_type;
4129
4130         if (insert->ins_contig != CONTIG_NONE) {
4131                 struct ocfs2_extent_rec *rec =
4132                                 &el->l_recs[insert->ins_contig_index];
4133                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4134                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4135
4136                 /*
4137                  * The caller might want us to limit the size of extents;
4138                  * don't calculate contiguousness if we might exceed that limit.
4139                  */
4140                 if (et->et_max_leaf_clusters &&
4141                     (len > et->et_max_leaf_clusters))
4142                         insert->ins_contig = CONTIG_NONE;
4143         }
4144 }
4145
4146 /*
4147  * This should only be called against the rightmost leaf extent list.
4148  *
4149  * ocfs2_figure_appending_type() will figure out whether we'll have to
4150  * insert at the tail of the rightmost leaf.
4151  *
4152  * This should also work against the root extent list for trees with 0
4153  * depth. If we consider the root extent list to be the rightmost leaf node
4154  * then the logic here makes sense.
4155  */
4156 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4157                                         struct ocfs2_extent_list *el,
4158                                         struct ocfs2_extent_rec *insert_rec)
4159 {
4160         int i;
4161         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4162         struct ocfs2_extent_rec *rec;
4163
4164         insert->ins_appending = APPEND_NONE;
4165
4166         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4167
4168         if (!el->l_next_free_rec)
4169                 goto set_tail_append;
4170
4171         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4172                 /* Were all records empty? */
4173                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4174                         goto set_tail_append;
4175         }
4176
4177         i = le16_to_cpu(el->l_next_free_rec) - 1;
4178         rec = &el->l_recs[i];
4179
4180         if (cpos >=
4181             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4182                 goto set_tail_append;
4183
4184         return;
4185
4186 set_tail_append:
4187         insert->ins_appending = APPEND_TAIL;
4188 }
4189
4190 /*
4191  * Helper function called at the beginning of an insert.
4192  *
4193  * This computes a few things that are commonly used in the process of
4194  * inserting into the btree:
4195  *   - Whether the new extent is contiguous with an existing one.
4196  *   - The current tree depth.
4197  *   - Whether the insert is an appending one.
4198  *   - The total # of free records in the tree.
4199  *
4200  * All of the information is stored on the ocfs2_insert_type
4201  * structure.
4202  */
4203 static int ocfs2_figure_insert_type(struct inode *inode,
4204                                     struct ocfs2_extent_tree *et,
4205                                     struct buffer_head **last_eb_bh,
4206                                     struct ocfs2_extent_rec *insert_rec,
4207                                     int *free_records,
4208                                     struct ocfs2_insert_type *insert)
4209 {
4210         int ret;
4211         struct ocfs2_extent_block *eb;
4212         struct ocfs2_extent_list *el;
4213         struct ocfs2_path *path = NULL;
4214         struct buffer_head *bh = NULL;
4215
4216         insert->ins_split = SPLIT_NONE;
4217
4218         el = et->et_root_el;
4219         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4220
4221         if (el->l_tree_depth) {
4222                 /*
4223                  * If we have tree depth, we read in the
4224                  * rightmost extent block ahead of time as
4225                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4226                  * may want it later.
4227                  */
4228                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4229                                        ocfs2_et_get_last_eb_blk(et), &bh,
4230                                        OCFS2_BH_CACHED, inode);
4231                 if (ret) {
4232                         mlog_exit(ret);
4233                         goto out;
4234                 }
4235                 eb = (struct ocfs2_extent_block *) bh->b_data;
4236                 el = &eb->h_list;
4237         }
4238
4239         /*
4240          * Unless we have a contiguous insert, we'll need to know if
4241          * there is room left in our allocation tree for another
4242          * extent record.
4243          *
4244          * XXX: This test is simplistic, we can search for empty
4245          * extent records too.
4246          */
4247         *free_records = le16_to_cpu(el->l_count) -
4248                 le16_to_cpu(el->l_next_free_rec);
4249
4250         if (!insert->ins_tree_depth) {
4251                 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4252                 ocfs2_figure_appending_type(insert, el, insert_rec);
4253                 return 0;
4254         }
4255
4256         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4257         if (!path) {
4258                 ret = -ENOMEM;
4259                 mlog_errno(ret);
4260                 goto out;
4261         }
4262
4263         /*
4264          * In the case that we're inserting past what the tree
4265          * currently accounts for, ocfs2_find_path() will return for
4266          * us the rightmost tree path. This is accounted for below in
4267          * the appending code.
4268          */
4269         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4270         if (ret) {
4271                 mlog_errno(ret);
4272                 goto out;
4273         }
4274
4275         el = path_leaf_el(path);
4276
4277         /*
4278          * Now that we have the path, there are two things we want to determine:
4279          * 1) Contiguousness (also set contig_index if this is so)
4280          *
4281          * 2) Are we doing an append? We can trivially break this up
4282          *     into two types of appends: simple record append, or a
4283          *     rotate inside the tail leaf.
4284          */
4285         ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4286
4287         /*
4288          * The insert code isn't quite ready to deal with all cases of
4289          * left contiguousness. Specifically, if it's an insert into
4290          * the 1st record in a leaf, it will require the adjustment of
4291          * cluster count on the last record of the path directly to its
4292          * left. For now, just catch that case and fool the layers
4293          * above us. This works just fine for tree_depth == 0, which
4294          * is why we allow that above.
4295          */
4296         if (insert->ins_contig == CONTIG_LEFT &&
4297             insert->ins_contig_index == 0)
4298                 insert->ins_contig = CONTIG_NONE;
4299
4300         /*
4301          * Ok, so we can simply compare against last_eb to figure out
4302          * whether the path doesn't exist. This will only happen in
4303          * the case that we're doing a tail append, so maybe we can
4304          * take advantage of that information somehow.
4305          */
4306         if (ocfs2_et_get_last_eb_blk(et) ==
4307             path_leaf_bh(path)->b_blocknr) {
4308                 /*
4309                  * Ok, ocfs2_find_path() returned us the rightmost
4310                  * tree path. This might be an appending insert. There are
4311                  * two cases:
4312                  *    1) We're doing a true append at the tail:
4313                  *      -This might even be off the end of the leaf
4314                  *    2) We're "appending" by rotating in the tail
4315                  */
4316                 ocfs2_figure_appending_type(insert, el, insert_rec);
4317         }
4318
4319 out:
4320         ocfs2_free_path(path);
4321
4322         if (ret == 0)
4323                 *last_eb_bh = bh;
4324         else
4325                 brelse(bh);
4326         return ret;
4327 }
4328
4329 /*
4330  * Insert an extent into the extent tree described by 'et'.
4331  *
4332  * The caller needs to update the root's cluster count (fe->i_clusters for a dinode).
4333  */
4334 static int ocfs2_insert_extent(struct ocfs2_super *osb,
4335                                handle_t *handle,
4336                                struct inode *inode,
4337                                struct buffer_head *root_bh,
4338                                u32 cpos,
4339                                u64 start_blk,
4340                                u32 new_clusters,
4341                                u8 flags,
4342                                struct ocfs2_alloc_context *meta_ac,
4343                                struct ocfs2_extent_tree *et)
4344 {
4345         int status;
4346         int uninitialized_var(free_records);
4347         struct buffer_head *last_eb_bh = NULL;
4348         struct ocfs2_insert_type insert = {0, };
4349         struct ocfs2_extent_rec rec;
4350
4351         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4352
4353         mlog(0, "add %u clusters at position %u to inode %llu\n",
4354              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4355
4356         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4357                         (OCFS2_I(inode)->ip_clusters != cpos),
4358                         "Device %s, asking for sparse allocation: inode %llu, "
4359                         "cpos %u, clusters %u\n",
4360                         osb->dev_str,
4361                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4362                         OCFS2_I(inode)->ip_clusters);
4363
4364         memset(&rec, 0, sizeof(rec));
4365         rec.e_cpos = cpu_to_le32(cpos);
4366         rec.e_blkno = cpu_to_le64(start_blk);
4367         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4368         rec.e_flags = flags;
4369
4370         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4371                                           &free_records, &insert);
4372         if (status < 0) {
4373                 mlog_errno(status);
4374                 goto bail;
4375         }
4376
4377         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4378              "Insert.contig_index: %d, Insert.free_records: %d, "
4379              "Insert.tree_depth: %d\n",
4380              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4381              free_records, insert.ins_tree_depth);
4382
4383         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4384                 status = ocfs2_grow_tree(inode, handle, et,
4385                                          &insert.ins_tree_depth, &last_eb_bh,
4386                                          meta_ac);
4387                 if (status) {
4388                         mlog_errno(status);
4389                         goto bail;
4390                 }
4391         }
4392
4393         /* Finally, we can add clusters. This might rotate the tree for us. */
4394         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4395         if (status < 0)
4396                 mlog_errno(status);
4397         else if (et->et_type == OCFS2_DINODE_EXTENT)
4398                 ocfs2_extent_map_insert_rec(inode, &rec);
4399
4400 bail:
4401         if (last_eb_bh)
4402                 brelse(last_eb_bh);
4403
4404         mlog_exit(status);
4405         return status;
4406 }
4407
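/*
 * The three wrappers below build a temporary ocfs2_extent_tree of the
 * appropriate type around root_bh and delegate the actual insert to
 * ocfs2_insert_extent().
 */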
4408 int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
4409                                handle_t *handle,
4410                                struct inode *inode,
4411                                struct buffer_head *root_bh,
4412                                u32 cpos,
4413                                u64 start_blk,
4414                                u32 new_clusters,
4415                                u8 flags,
4416                                struct ocfs2_alloc_context *meta_ac)
4417 {
4418         int status;
4419         struct ocfs2_extent_tree et;
4420
4421         ocfs2_get_extent_tree(&et, inode, root_bh, OCFS2_DINODE_EXTENT,
4422                               NULL);
4423         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4424                                      cpos, start_blk, new_clusters,
4425                                      flags, meta_ac, &et);
4426         ocfs2_put_extent_tree(&et);
4427
4428         return status;
4429 }
4430
4431 int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
4432                                     handle_t *handle,
4433                                     struct inode *inode,
4434                                     struct buffer_head *root_bh,
4435                                     u32 cpos,
4436                                     u64 start_blk,
4437                                     u32 new_clusters,
4438                                     u8 flags,
4439                                     struct ocfs2_alloc_context *meta_ac,
4440                                     void *obj)
4441 {
4442         int status;
4443         struct ocfs2_extent_tree et;
4444
4445         ocfs2_get_extent_tree(&et, inode, root_bh,
4446                               OCFS2_XATTR_VALUE_EXTENT, obj);
4447         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4448                                      cpos, start_blk, new_clusters,
4449                                      flags, meta_ac, &et);
4450         ocfs2_put_extent_tree(&et);
4451
4452         return status;
4453 }
4454
4455 int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
4456                                    handle_t *handle,
4457                                    struct inode *inode,
4458                                    struct buffer_head *root_bh,
4459                                    u32 cpos,
4460                                    u64 start_blk,
4461                                    u32 new_clusters,
4462                                    u8 flags,
4463                                    struct ocfs2_alloc_context *meta_ac)
4464 {
4465         int status;
4466         struct ocfs2_extent_tree et;
4467
4468         ocfs2_get_extent_tree(&et, inode, root_bh, OCFS2_XATTR_TREE_EXTENT,
4469                               NULL);
4470         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4471                                      cpos, start_blk, new_clusters,
4472                                      flags, meta_ac, &et);
4473         ocfs2_put_extent_tree(&et);
4474
4475         return status;
4476 }
4477
4478 /*
4479  * Allocate and add clusters into the extent b-tree.
4480  * The new clusters (clusters_to_add) will be inserted at logical_offset.
4481  * The extent b-tree's root is root_el and should be stored in root_bh;
4482  * it is not limited to file data. Any extent tree can use this function
4483  * if it implements the proper ocfs2_extent_tree.
4484  */
4485 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4486                                 struct inode *inode,
4487                                 u32 *logical_offset,
4488                                 u32 clusters_to_add,
4489                                 int mark_unwritten,
4490                                 struct buffer_head *root_bh,
4491                                 struct ocfs2_extent_list *root_el,
4492                                 handle_t *handle,
4493                                 struct ocfs2_alloc_context *data_ac,
4494                                 struct ocfs2_alloc_context *meta_ac,
4495                                 enum ocfs2_alloc_restarted *reason_ret,
4496                                 enum ocfs2_extent_tree_type type,
4497                                 void *obj)
4498 {
4499         int status = 0;
4500         int free_extents;
4501         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4502         u32 bit_off, num_bits;
4503         u64 block;
4504         u8 flags = 0;
4505
4506         BUG_ON(!clusters_to_add);
4507
4508         if (mark_unwritten)
4509                 flags = OCFS2_EXT_UNWRITTEN;
4510
4511         free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
4512                                               obj);
4513         if (free_extents < 0) {
4514                 status = free_extents;
4515                 mlog_errno(status);
4516                 goto leave;
4517         }
4518
4519         /* there are two cases which could cause us to EAGAIN in the
4520          * we-need-more-metadata case:
4521          * 1) we haven't reserved *any*
4522          * 2) we are so fragmented, we've needed to add metadata too
4523          *    many times. */
4524         if (!free_extents && !meta_ac) {
4525                 mlog(0, "we haven't reserved any metadata!\n");
4526                 status = -EAGAIN;
4527                 reason = RESTART_META;
4528                 goto leave;
4529         } else if ((!free_extents)
4530                    && (ocfs2_alloc_context_bits_left(meta_ac)
4531                        < ocfs2_extend_meta_needed(root_el))) {
4532                 mlog(0, "filesystem is really fragmented...\n");
4533                 status = -EAGAIN;
4534                 reason = RESTART_META;
4535                 goto leave;
4536         }
4537
4538         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4539                                         clusters_to_add, &bit_off, &num_bits);
4540         if (status < 0) {
4541                 if (status != -ENOSPC)
4542                         mlog_errno(status);
4543                 goto leave;
4544         }
4545
4546         BUG_ON(num_bits > clusters_to_add);
4547
4548         /* reserve our write early -- insert_extent may update the inode */
4549         status = ocfs2_journal_access(handle, inode, root_bh,
4550                                       OCFS2_JOURNAL_ACCESS_WRITE);
4551         if (status < 0) {
4552                 mlog_errno(status);
4553                 goto leave;
4554         }
4555
4556         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4557         mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4558              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4559         if (type == OCFS2_DINODE_EXTENT)
4560                 status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh,
4561                                                     *logical_offset, block,
4562                                                     num_bits, flags, meta_ac);
4563         else if (type == OCFS2_XATTR_TREE_EXTENT)
4564                 status = ocfs2_xattr_tree_insert_extent(osb, handle,
4565                                                         inode, root_bh,
4566                                                         *logical_offset,
4567                                                         block, num_bits, flags,
4568                                                         meta_ac);
4569         else
4570                 status = ocfs2_xattr_value_insert_extent(osb, handle,
4571                                                          inode, root_bh,
4572                                                          *logical_offset,
4573                                                          block, num_bits, flags,
4574                                                          meta_ac, obj);
4575         if (status < 0) {
4576                 mlog_errno(status);
4577                 goto leave;
4578         }
4579
4580         status = ocfs2_journal_dirty(handle, root_bh);
4581         if (status < 0) {
4582                 mlog_errno(status);
4583                 goto leave;
4584         }
4585
4586         clusters_to_add -= num_bits;
4587         *logical_offset += num_bits;
4588
4589         if (clusters_to_add) {
4590                 mlog(0, "need to alloc once more, wanted = %u\n",
4591                      clusters_to_add);
4592                 status = -EAGAIN;
4593                 reason = RESTART_TRANS;
4594         }
4595
4596 leave:
4597         mlog_exit(status);
4598         if (reason_ret)
4599                 *reason_ret = reason;
4600         return status;
4601 }
4602
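/*
 * Fill split_rec with the right-hand portion of rec starting at cpos:
 * same flags, a length running from cpos to the end of rec, and a block
 * number advanced by the clusters that were skipped over.
 */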
4603 static void ocfs2_make_right_split_rec(struct super_block *sb,
4604                                        struct ocfs2_extent_rec *split_rec,
4605                                        u32 cpos,
4606                                        struct ocfs2_extent_rec *rec)
4607 {
4608         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4609         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4610
4611         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4612
4613         split_rec->e_cpos = cpu_to_le32(cpos);
4614         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4615
4616         split_rec->e_blkno = rec->e_blkno;
4617         le64_add_cpu(&split_rec->e_blkno,
4618                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4619
4620         split_rec->e_flags = rec->e_flags;
4621 }
4622
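/*
 * Insert orig_split_rec into the leaf record at split_index by splitting
 * that record. A split that shares the record's left or right edge is
 * done in a single pass; a split in the middle of the record takes two
 * passes (the remainder to the right of the new range is split off
 * first, then the new range itself). The tree is grown beforehand if
 * the rightmost leaf has no free records.
 */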
4623 static int ocfs2_split_and_insert(struct inode *inode,
4624                                   handle_t *handle,
4625                                   struct ocfs2_path *path,
4626                                   struct ocfs2_extent_tree *et,
4627                                   struct buffer_head **last_eb_bh,
4628                                   int split_index,
4629                                   struct ocfs2_extent_rec *orig_split_rec,
4630                                   struct ocfs2_alloc_context *meta_ac)
4631 {
4632         int ret = 0, depth;
4633         unsigned int insert_range, rec_range, do_leftright = 0;
4634         struct ocfs2_extent_rec tmprec;
4635         struct ocfs2_extent_list *rightmost_el;
4636         struct ocfs2_extent_rec rec;
4637         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4638         struct ocfs2_insert_type insert;
4639         struct ocfs2_extent_block *eb;
4640
4641 leftright:
4642         /*
4643          * Store a copy of the record on the stack - it might move
4644          * around as the tree is manipulated below.
4645          */
4646         rec = path_leaf_el(path)->l_recs[split_index];
4647
4648         rightmost_el = et->et_root_el;
4649
4650         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4651         if (depth) {
4652                 BUG_ON(!(*last_eb_bh));
4653                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4654                 rightmost_el = &eb->h_list;
4655         }
4656
4657         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4658             le16_to_cpu(rightmost_el->l_count)) {
4659                 ret = ocfs2_grow_tree(inode, handle, et,
4660                                       &depth, last_eb_bh, meta_ac);
4661                 if (ret) {
4662                         mlog_errno(ret);
4663                         goto out;
4664                 }
4665         }
4666
4667         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4668         insert.ins_appending = APPEND_NONE;
4669         insert.ins_contig = CONTIG_NONE;
4670         insert.ins_tree_depth = depth;
4671
4672         insert_range = le32_to_cpu(split_rec.e_cpos) +
4673                 le16_to_cpu(split_rec.e_leaf_clusters);
4674         rec_range = le32_to_cpu(rec.e_cpos) +
4675                 le16_to_cpu(rec.e_leaf_clusters);
4676
4677         if (split_rec.e_cpos == rec.e_cpos) {
4678                 insert.ins_split = SPLIT_LEFT;
4679         } else if (insert_range == rec_range) {
4680                 insert.ins_split = SPLIT_RIGHT;
4681         } else {
4682                 /*
4683                  * Left/right split. We fake this as a right split
4684                  * first and then make a second pass as a left split.
4685                  */
4686                 insert.ins_split = SPLIT_RIGHT;
4687
4688                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4689                                            &rec);
4690
4691                 split_rec = tmprec;
4692
4693                 BUG_ON(do_leftright);
4694                 do_leftright = 1;
4695         }
4696
4697         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4698         if (ret) {
4699                 mlog_errno(ret);
4700                 goto out;
4701         }
4702
4703         if (do_leftright == 1) {
4704                 u32 cpos;
4705                 struct ocfs2_extent_list *el;
4706
4707                 do_leftright++;
4708                 split_rec = *orig_split_rec;
4709
4710                 ocfs2_reinit_path(path, 1);
4711
4712                 cpos = le32_to_cpu(split_rec.e_cpos);
4713                 ret = ocfs2_find_path(inode, path, cpos);
4714                 if (ret) {
4715                         mlog_errno(ret);
4716                         goto out;
4717                 }
4718
4719                 el = path_leaf_el(path);
4720                 split_index = ocfs2_search_extent_list(el, cpos);
4721                 goto leftright;
4722         }
4723 out:
4724
4725         return ret;
4726 }
4727
4728 /*
4729  * Mark part or all of the extent record at split_index in the leaf
4730  * pointed to by path as written. This removes the unwritten
4731  * extent flag.
4732  *
4733  * Care is taken to handle contiguousness so as to not grow the tree.
4734  *
4735  * meta_ac is not strictly necessary - we only truly need it if growth
4736  * of the tree is required. All other cases will degrade into a less
4737  * optimal tree layout.
4738  *
4739  * last_eb_bh should be the rightmost leaf block for any extent
4740  * btree. Since a split may grow the tree or a merge might shrink it,
4741  * the caller cannot trust the contents of that buffer after this call.
4742  *
4743  * This code is optimized for readability - several passes might be
4744  * made over certain portions of the tree. All of those blocks will
4745  * have been brought into cache (and pinned via the journal), so the
4746  * extra overhead is not expressed in terms of disk reads.
4747  */
4748 static int __ocfs2_mark_extent_written(struct inode *inode,
4749                                        struct ocfs2_extent_tree *et,
4750                                        handle_t *handle,
4751                                        struct ocfs2_path *path,
4752                                        int split_index,
4753                                        struct ocfs2_extent_rec *split_rec,
4754                                        struct ocfs2_alloc_context *meta_ac,
4755                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4756 {
4757         int ret = 0;
4758         struct ocfs2_extent_list *el = path_leaf_el(path);
4759         struct buffer_head *last_eb_bh = NULL;
4760         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4761         struct ocfs2_merge_ctxt ctxt;
4762         struct ocfs2_extent_list *rightmost_el;
4763
4764         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4765                 ret = -EIO;
4766                 mlog_errno(ret);
4767                 goto out;
4768         }
4769
4770         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4771             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4772              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4773                 ret = -EIO;
4774                 mlog_errno(ret);
4775                 goto out;
4776         }
4777
4778         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4779                                                             split_index,
4780                                                             split_rec);
4781
4782         /*
4783          * The core merge / split code wants to know how much room is
4784          * left in this inode's allocation tree, so we pass the
4785          * rightmost extent list.
4786          */
4787         if (path->p_tree_depth) {
4788                 struct ocfs2_extent_block *eb;
4789
4790                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4791                                        ocfs2_et_get_last_eb_blk(et),
4792                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4793                 if (ret) {
4794                         mlog_exit(ret);
4795                         goto out;
4796                 }
4797
4798                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4799                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4800                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4801                         ret = -EROFS;
4802                         goto out;
4803                 }
4804
4805                 rightmost_el = &eb->h_list;
4806         } else
4807                 rightmost_el = path_root_el(path);
4808
4809         if (rec->e_cpos == split_rec->e_cpos &&
4810             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4811                 ctxt.c_split_covers_rec = 1;
4812         else
4813                 ctxt.c_split_covers_rec = 0;
4814
4815         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4816
4817         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4818              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4819              ctxt.c_split_covers_rec);
4820
4821         if (ctxt.c_contig_type == CONTIG_NONE) {
4822                 if (ctxt.c_split_covers_rec)
4823                         el->l_recs[split_index] = *split_rec;
4824                 else
4825                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4826                                                      &last_eb_bh, split_index,
4827                                                      split_rec, meta_ac);
4828                 if (ret)
4829                         mlog_errno(ret);
4830         } else {
4831                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4832                                                 split_index, split_rec,
4833                                                 dealloc, &ctxt, et);
4834                 if (ret)
4835                         mlog_errno(ret);
4836         }
4837
4838 out:
4839         brelse(last_eb_bh);
4840         return ret;
4841 }
4842
4843 /*
4844  * Mark the already-existing extent at cpos as written for len clusters.
4845  *
4846  * If the existing extent is larger than the request, initiate a
4847  * split. An attempt will be made at merging with adjacent extents.
4848  *
4849  * The caller is responsible for passing down meta_ac if we'll need it.
4850  */
4851 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
4852                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4853                               struct ocfs2_alloc_context *meta_ac,
4854                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4855                               enum ocfs2_extent_tree_type et_type,
4856                               void *obj)
4857 {
4858         int ret, index;
4859         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4860         struct ocfs2_extent_rec split_rec;
4861         struct ocfs2_path *left_path = NULL;
4862         struct ocfs2_extent_list *el;
4863         struct ocfs2_extent_tree et;
4864
4865         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4866              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4867
4868         ocfs2_get_extent_tree(&et, inode, root_bh, et_type, obj);
4869
4870         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4871                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4872                             "that are being written to, but the feature bit "
4873                             "is not set in the super block.",
4874                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4875                 ret = -EROFS;
4876                 goto out;
4877         }
4878
4879         /*
4880          * XXX: This should be fixed up so that we just re-insert the
4881          * next extent records.
4882          */
4883         if (et_type == OCFS2_DINODE_EXTENT)
4884                 ocfs2_extent_map_trunc(inode, 0);
4885
4886         left_path = ocfs2_new_path(et.et_root_bh, et.et_root_el);
4887         if (!left_path) {
4888                 ret = -ENOMEM;
4889                 mlog_errno(ret);
4890                 goto out;
4891         }
4892
4893         ret = ocfs2_find_path(inode, left_path, cpos);
4894         if (ret) {
4895                 mlog_errno(ret);
4896                 goto out;
4897         }
4898         el = path_leaf_el(left_path);
4899
4900         index = ocfs2_search_extent_list(el, cpos);
4901         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4902                 ocfs2_error(inode->i_sb,
4903                             "Inode %llu has an extent at cpos %u which can no "
4904                             "longer be found.\n",
4905                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4906                 ret = -EROFS;
4907                 goto out;
4908         }
4909
4910         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4911         split_rec.e_cpos = cpu_to_le32(cpos);
4912         split_rec.e_leaf_clusters = cpu_to_le16(len);
4913         split_rec.e_blkno = cpu_to_le64(start_blkno);
4914         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4915         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4916
4917         ret = __ocfs2_mark_extent_written(inode, &et, handle, left_path,
4918                                           index, &split_rec, meta_ac,
4919                                           dealloc);
4920         if (ret)
4921                 mlog_errno(ret);
4922
4923 out:
4924         ocfs2_free_path(left_path);
4925         ocfs2_put_extent_tree(&et);
4926         return ret;
4927 }
4928
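/*
 * Split the extent record at 'index' in the leaf pointed to by 'path'
 * at cluster offset new_range, turning its tail into a separate record.
 * The transaction is extended and, if the rightmost extent list is
 * full, the tree is grown before the SPLIT_RIGHT insert is performed.
 */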
4929 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4930                             handle_t *handle, struct ocfs2_path *path,
4931                             int index, u32 new_range,
4932                             struct ocfs2_alloc_context *meta_ac)
4933 {
4934         int ret, depth, credits = handle->h_buffer_credits;
4935         struct buffer_head *last_eb_bh = NULL;
4936         struct ocfs2_extent_block *eb;
4937         struct ocfs2_extent_list *rightmost_el, *el;
4938         struct ocfs2_extent_rec split_rec;
4939         struct ocfs2_extent_rec *rec;
4940         struct ocfs2_insert_type insert;
4941
4942         /*
4943          * Setup the record to split before we grow the tree.
4944          */
4945         el = path_leaf_el(path);
4946         rec = &el->l_recs[index];
4947         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4948
4949         depth = path->p_tree_depth;
4950         if (depth > 0) {
4951                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4952                                        ocfs2_et_get_last_eb_blk(et),
4953                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4954                 if (ret < 0) {
4955                         mlog_errno(ret);
4956                         goto out;
4957                 }
4958
4959                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4960                 rightmost_el = &eb->h_list;
4961         } else
4962                 rightmost_el = path_leaf_el(path);
4963
4964         credits += path->p_tree_depth +
4965                    ocfs2_extend_meta_needed(et->et_root_el);
4966         ret = ocfs2_extend_trans(handle, credits);
4967         if (ret) {
4968                 mlog_errno(ret);
4969                 goto out;
4970         }
4971
4972         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4973             le16_to_cpu(rightmost_el->l_count)) {
4974                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4975                                       meta_ac);
4976                 if (ret) {
4977                         mlog_errno(ret);
4978                         goto out;
4979                 }
4980         }
4981
4982         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4983         insert.ins_appending = APPEND_NONE;
4984         insert.ins_contig = CONTIG_NONE;
4985         insert.ins_split = SPLIT_RIGHT;
4986         insert.ins_tree_depth = depth;
4987
4988         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4989         if (ret)
4990                 mlog_errno(ret);
4991
4992 out:
4993         brelse(last_eb_bh);
4994         return ret;
4995 }
4996
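/*
 * Remove the range (cpos, len) from the extent record at 'index' in the
 * leaf pointed to by 'path'. The range must share at least one edge with
 * the record: either the whole record is emptied and the tree
 * left-rotated to fill the hole, or the record is shrunk from the left
 * or right. Records above the leaf are updated when the leftmost or
 * rightmost edge of the tree changes.
 */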
4997 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4998                               struct ocfs2_path *path, int index,
4999                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5000                               u32 cpos, u32 len,
5001                               struct ocfs2_extent_tree *et)
5002 {
5003         int ret;
5004         u32 left_cpos, rec_range, trunc_range;
5005         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5006         struct super_block *sb = inode->i_sb;
5007         struct ocfs2_path *left_path = NULL;
5008         struct ocfs2_extent_list *el = path_leaf_el(path);
5009         struct ocfs2_extent_rec *rec;
5010         struct ocfs2_extent_block *eb;
5011
5012         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5013                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5014                 if (ret) {
5015                         mlog_errno(ret);
5016                         goto out;
5017                 }
5018
5019                 index--;
5020         }
5021
5022         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5023             path->p_tree_depth) {
5024                 /*
5025                  * Check whether this is the rightmost tree record. If
5026                  * we remove all of this record or part of its right
5027                  * edge then an update of the record lengths above it
5028                  * will be required.
5029                  */
5030                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5031                 if (eb->h_next_leaf_blk == 0)
5032                         is_rightmost_tree_rec = 1;
5033         }
5034
5035         rec = &el->l_recs[index];
5036         if (index == 0 && path->p_tree_depth &&
5037             le32_to_cpu(rec->e_cpos) == cpos) {
5038                 /*
5039                  * Changing the leftmost offset (via partial or whole
5040                  * record truncate) of an interior (or rightmost) path
5041                  * means we have to update the subtree that is formed
5042                  * by this leaf and the one to its left.
5043                  *
5044                  * There are two cases we can skip:
5045                  *   1) Path is the leftmost one in our inode tree.
5046                  *   2) The leaf is rightmost and will be empty after
5047                  *      we remove the extent record - the rotate code
5048                  *      knows how to update the newly formed edge.
5049                  */
5050
5051                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5052                                                     &left_cpos);
5053                 if (ret) {
5054                         mlog_errno(ret);
5055                         goto out;
5056                 }
5057
5058                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5059                         left_path = ocfs2_new_path(path_root_bh(path),
5060                                                    path_root_el(path));
5061                         if (!left_path) {
5062                                 ret = -ENOMEM;
5063                                 mlog_errno(ret);
5064                                 goto out;
5065                         }
5066
5067                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5068                         if (ret) {
5069                                 mlog_errno(ret);
5070                                 goto out;
5071                         }
5072                 }
5073         }
5074
5075         ret = ocfs2_extend_rotate_transaction(handle, 0,
5076                                               handle->h_buffer_credits,
5077                                               path);
5078         if (ret) {
5079                 mlog_errno(ret);
5080                 goto out;
5081         }
5082
5083         ret = ocfs2_journal_access_path(inode, handle, path);
5084         if (ret) {
5085                 mlog_errno(ret);
5086                 goto out;
5087         }
5088
5089         ret = ocfs2_journal_access_path(inode, handle, left_path);
5090         if (ret) {
5091                 mlog_errno(ret);
5092                 goto out;
5093         }
5094
5095         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5096         trunc_range = cpos + len;
5097
5098         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5099                 int next_free;
5100
5101                 memset(rec, 0, sizeof(*rec));
5102                 ocfs2_cleanup_merge(el, index);
5103                 wants_rotate = 1;
5104
5105                 next_free = le16_to_cpu(el->l_next_free_rec);
5106                 if (is_rightmost_tree_rec && next_free > 1) {
5107                         /*
5108                          * We skip the edge update if this path will
5109                          * be deleted by the rotate code.
5110                          */
5111                         rec = &el->l_recs[next_free - 1];
5112                         ocfs2_adjust_rightmost_records(inode, handle, path,
5113                                                        rec);
5114                 }
5115         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5116                 /* Remove leftmost portion of the record. */
5117                 le32_add_cpu(&rec->e_cpos, len);
5118                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5119                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5120         } else if (rec_range == trunc_range) {
5121                 /* Remove rightmost portion of the record */
5122                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5123                 if (is_rightmost_tree_rec)
5124                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5125         } else {
5126                 /* Caller should have trapped this. */
5127                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5128                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5129                      le32_to_cpu(rec->e_cpos),
5130                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5131                 BUG();
5132         }
5133
5134         if (left_path) {
5135                 int subtree_index;
5136
5137                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5138                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5139                                            subtree_index);
5140         }
5141
5142         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5143
5144         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5145         if (ret) {
5146                 mlog_errno(ret);
5147                 goto out;
5148         }
5149
5150 out:
5151         ocfs2_free_path(left_path);
5152         return ret;
5153 }
5154
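/*
 * Remove 'len' clusters starting at 'cpos' from the extent tree rooted
 * at root_bh. The range must fall entirely within a single extent
 * record; removals from the middle of a record are handled by first
 * splitting the record so that only an edge removal remains.
 */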
5155 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
5156                         u32 cpos, u32 len, handle_t *handle,
5157                         struct ocfs2_alloc_context *meta_ac,
5158                         struct ocfs2_cached_dealloc_ctxt *dealloc,
5159                         enum ocfs2_extent_tree_type et_type,
5160                         void *obj)
5161 {
5162         int ret, index;
5163         u32 rec_range, trunc_range;
5164         struct ocfs2_extent_rec *rec;
5165         struct ocfs2_extent_list *el;
5166         struct ocfs2_path *path = NULL;
5167         struct ocfs2_extent_tree et;
5168
5169         ocfs2_get_extent_tree(&et, inode, root_bh, et_type, obj);
5170
5171         ocfs2_extent_map_trunc(inode, 0);
5172
5173         path = ocfs2_new_path(et.et_root_bh, et.et_root_el);
5174         if (!path) {
5175                 ret = -ENOMEM;
5176                 mlog_errno(ret);
5177                 goto out;
5178         }
5179
5180         ret = ocfs2_find_path(inode, path, cpos);
5181         if (ret) {
5182                 mlog_errno(ret);
5183                 goto out;
5184         }
5185
5186         el = path_leaf_el(path);
5187         index = ocfs2_search_extent_list(el, cpos);
5188         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5189                 ocfs2_error(inode->i_sb,
5190                             "Inode %llu has an extent at cpos %u which can no "
5191                             "longer be found.\n",
5192                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5193                 ret = -EROFS;
5194                 goto out;
5195         }
5196
5197         /*
5198          * We have 3 cases of extent removal:
5199          *   1) Range covers the entire extent rec
5200          *   2) Range begins or ends on one edge of the extent rec
5201          *   3) Range is in the middle of the extent rec (no shared edges)
5202          *
5203          * For case 1 we remove the extent rec and left rotate to
5204          * fill the hole.
5205          *
5206          * For case 2 we just shrink the existing extent rec, with a
5207          * tree update if the shrinking edge is also the edge of an
5208          * extent block.
5209          *
5210          * For case 3 we do a right split to turn the extent rec into
5211          * something case 2 can handle.
5212          */
5213         rec = &el->l_recs[index];
5214         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5215         trunc_range = cpos + len;
5216
5217         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5218
5219         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5220              "(cpos %u, len %u)\n",
5221              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5222              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5223
5224         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5225                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5226                                          cpos, len, &et);
5227                 if (ret) {
5228                         mlog_errno(ret);
5229                         goto out;
5230                 }
5231         } else {
5232                 ret = ocfs2_split_tree(inode, &et, handle, path, index,
5233                                        trunc_range, meta_ac);
5234                 if (ret) {
5235                         mlog_errno(ret);
5236                         goto out;
5237                 }
5238
5239                 /*
5240                  * The split could have manipulated the tree enough to
5241                  * move the record location, so we have to look for it again.
5242                  */
5243                 ocfs2_reinit_path(path, 1);
5244
5245                 ret = ocfs2_find_path(inode, path, cpos);
5246                 if (ret) {
5247                         mlog_errno(ret);
5248                         goto out;
5249                 }
5250
5251                 el = path_leaf_el(path);
5252                 index = ocfs2_search_extent_list(el, cpos);
5253                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5254                         ocfs2_error(inode->i_sb,
5255                                     "Inode %llu: split at cpos %u lost record.",
5256                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5257                                     cpos);
5258                         ret = -EROFS;
5259                         goto out;
5260                 }
5261
5262                 /*
5263                  * Double check our values here. If anything is fishy,
5264                  * it's easier to catch it at the top level.
5265                  */
5266                 rec = &el->l_recs[index];
5267                 rec_range = le32_to_cpu(rec->e_cpos) +
5268                         ocfs2_rec_clusters(el, rec);
5269                 if (rec_range != trunc_range) {
5270                         ocfs2_error(inode->i_sb,
5271                                     "Inode %llu: error after split at cpos %u"
5272                                     "trunc len %u, existing record is (%u,%u)",
5273                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5274                                     cpos, len, le32_to_cpu(rec->e_cpos),
5275                                     ocfs2_rec_clusters(el, rec));
5276                         ret = -EROFS;
5277                         goto out;
5278                 }
5279
5280                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5281                                          cpos, len, &et);
5282                 if (ret) {
5283                         mlog_errno(ret);
5284                         goto out;
5285                 }
5286         }
5287
5288 out:
5289         ocfs2_free_path(path);
5290         ocfs2_put_extent_tree(&et);
5291         return ret;
5292 }
5293
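/*
 * Return nonzero if the truncate log is full and must be flushed before
 * any more records can be appended to it.
 */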
5294 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5295 {
5296         struct buffer_head *tl_bh = osb->osb_tl_bh;
5297         struct ocfs2_dinode *di;
5298         struct ocfs2_truncate_log *tl;
5299
5300         di = (struct ocfs2_dinode *) tl_bh->b_data;
5301         tl = &di->id2.i_dealloc;
5302
5303         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5304                         "slot %d, invalid truncate log parameters: used = "
5305                         "%u, count = %u\n", osb->slot_num,
5306                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5307         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5308 }
5309
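/*
 * Return nonzero if new_start directly follows the last record in the
 * truncate log, in which case the new range can simply be folded into
 * that record. For example, a tail record covering clusters 100-107 can
 * absorb a new range that starts at cluster 108.
 */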
5310 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5311                                            unsigned int new_start)
5312 {
5313         unsigned int tail_index;
5314         unsigned int current_tail;
5315
5316         /* No records, nothing to coalesce */
5317         if (!le16_to_cpu(tl->tl_used))
5318                 return 0;
5319
5320         tail_index = le16_to_cpu(tl->tl_used) - 1;
5321         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5322         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5323
5324         return current_tail == new_start;
5325 }
5326
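/*
 * Append a record for num_clusters clusters starting at start_blk to
 * this node's truncate log, coalescing with the tail record when the
 * ranges are adjacent. The caller must hold tl_inode->i_mutex and is
 * expected to have flushed the log if it was full; a full log here
 * returns -ENOSPC.
 */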
5327 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5328                               handle_t *handle,
5329                               u64 start_blk,
5330                               unsigned int num_clusters)
5331 {
5332         int status, index;
5333         unsigned int start_cluster, tl_count;
5334         struct inode *tl_inode = osb->osb_tl_inode;
5335         struct buffer_head *tl_bh = osb->osb_tl_bh;
5336         struct ocfs2_dinode *di;
5337         struct ocfs2_truncate_log *tl;
5338
5339         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5340                    (unsigned long long)start_blk, num_clusters);
5341
5342         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5343
5344         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5345
5346         di = (struct ocfs2_dinode *) tl_bh->b_data;
5347         tl = &di->id2.i_dealloc;
5348         if (!OCFS2_IS_VALID_DINODE(di)) {
5349                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5350                 status = -EIO;
5351                 goto bail;
5352         }
5353
5354         tl_count = le16_to_cpu(tl->tl_count);
5355         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5356                         tl_count == 0,
5357                         "Truncate record count on #%llu invalid "
5358                         "wanted %u, actual %u\n",
5359                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5360                         ocfs2_truncate_recs_per_inode(osb->sb),
5361                         le16_to_cpu(tl->tl_count));
5362
5363         /* Caller should have known to flush before calling us. */
5364         index = le16_to_cpu(tl->tl_used);
5365         if (index >= tl_count) {
5366                 status = -ENOSPC;
5367                 mlog_errno(status);
5368                 goto bail;
5369         }
5370
5371         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5372                                       OCFS2_JOURNAL_ACCESS_WRITE);
5373         if (status < 0) {
5374                 mlog_errno(status);
5375                 goto bail;
5376         }
5377
5378         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5379              "%llu (index = %d)\n", num_clusters, start_cluster,
5380              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5381
5382         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5383                 /*
5384                  * Move index back to the record we are coalescing with.
5385                  * ocfs2_truncate_log_can_coalesce() guarantees that index is nonzero here.
5386                  */
5387                 index--;
5388
5389                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5390                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5391                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5392                      num_clusters);
5393         } else {
5394                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5395                 tl->tl_used = cpu_to_le16(index + 1);
5396         }
5397         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5398
5399         status = ocfs2_journal_dirty(handle, tl_bh);
5400         if (status < 0) {
5401                 mlog_errno(status);
5402                 goto bail;
5403         }
5404
5405 bail:
5406         mlog_exit(status);
5407         return status;
5408 }
5409
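/*
 * Walk the truncate log from the newest record to the oldest, freeing
 * each recorded cluster range back to the bitmap given by
 * data_alloc_inode/data_alloc_bh and shrinking tl_used as each record
 * is consumed. Records with a zero start block are skipped as invalid.
 */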
5410 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5411                                          handle_t *handle,
5412                                          struct inode *data_alloc_inode,
5413                                          struct buffer_head *data_alloc_bh)
5414 {
5415         int status = 0;
5416         int i;
5417         unsigned int num_clusters;
5418         u64 start_blk;
5419         struct ocfs2_truncate_rec rec;
5420         struct ocfs2_dinode *di;
5421         struct ocfs2_truncate_log *tl;
5422         struct inode *tl_inode = osb->osb_tl_inode;
5423         struct buffer_head *tl_bh = osb->osb_tl_bh;
5424
5425         mlog_entry_void();
5426
5427         di = (struct ocfs2_dinode *) tl_bh->b_data;
5428         tl = &di->id2.i_dealloc;
5429         i = le16_to_cpu(tl->tl_used) - 1;
5430         while (i >= 0) {
5431                 /* Caller has given us at least enough credits to
5432                  * update the truncate log dinode */
5433                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5434                                               OCFS2_JOURNAL_ACCESS_WRITE);
5435                 if (status < 0) {
5436                         mlog_errno(status);
5437                         goto bail;
5438                 }
5439
5440                 tl->tl_used = cpu_to_le16(i);
5441
5442                 status = ocfs2_journal_dirty(handle, tl_bh);
5443                 if (status < 0) {
5444                         mlog_errno(status);
5445                         goto bail;
5446                 }
5447
5448                 /* TODO: Perhaps we can calculate the bulk of the
5449                  * credits up front rather than extending like
5450                  * this. */
5451                 status = ocfs2_extend_trans(handle,
5452                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5453                 if (status < 0) {
5454                         mlog_errno(status);
5455                         goto bail;
5456                 }
5457
5458                 rec = tl->tl_recs[i];
5459                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5460                                                     le32_to_cpu(rec.t_start));
5461                 num_clusters = le32_to_cpu(rec.t_clusters);
5462
5463                 /* if start_blk is not set, we ignore the record as
5464                  * invalid. */
5465                 if (start_blk) {
5466                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5467                              i, le32_to_cpu(rec.t_start), num_clusters);
5468
5469                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5470                                                      data_alloc_bh, start_blk,
5471                                                      num_clusters);
5472                         if (status < 0) {
5473                                 mlog_errno(status);
5474                                 goto bail;
5475                         }
5476                 }
5477                 i--;
5478         }
5479
5480 bail:
5481         mlog_exit(status);
5482         return status;
5483 }
5484
5485 /* Expects you to already be holding tl_inode->i_mutex */
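/* ocfs2_flush_truncate_log() below wraps this with the required
 * tl_inode->i_mutex locking for callers that do not already hold it. */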
5486 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5487 {
5488         int status;
5489         unsigned int num_to_flush;
5490         handle_t *handle;
5491         struct inode *tl_inode = osb->osb_tl_inode;
5492         struct inode *data_alloc_inode = NULL;
5493         struct buffer_head *tl_bh = osb->osb_tl_bh;
5494         struct buffer_head *data_alloc_bh = NULL;
5495         struct ocfs2_dinode *di;
5496         struct ocfs2_truncate_log *tl;
5497
5498         mlog_entry_void();
5499
5500         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5501
5502         di = (struct ocfs2_dinode *) tl_bh->b_data;
5503         tl = &di->id2.i_dealloc;
5504         if (!OCFS2_IS_VALID_DINODE(di)) {
5505                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5506                 status = -EIO;
5507                 goto out;
5508         }
5509
5510         num_to_flush = le16_to_cpu(tl->tl_used);
5511         mlog(0, "Flush %u records from truncate log #%llu\n",
5512              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5513         if (!num_to_flush) {
5514                 status = 0;
5515                 goto out;
5516         }
5517
5518         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5519                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5520                                                        OCFS2_INVALID_SLOT);
5521         if (!data_alloc_inode) {
5522                 status = -EINVAL;
5523                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5524                 goto out;
5525         }
5526
5527         mutex_lock(&data_alloc_inode->i_mutex);
5528
5529         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5530         if (status < 0) {
5531                 mlog_errno(status);
5532                 goto out_mutex;
5533         }
5534
5535         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5536         if (IS_ERR(handle)) {
5537                 status = PTR_ERR(handle);
5538                 mlog_errno(status);
5539                 goto out_unlock;
5540         }
5541
5542         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5543                                                data_alloc_bh);
5544         if (status < 0)
5545                 mlog_errno(status);
5546
5547         ocfs2_commit_trans(osb, handle);
5548
5549 out_unlock:
5550         brelse(data_alloc_bh);
5551         ocfs2_inode_unlock(data_alloc_inode, 1);
5552
5553 out_mutex:
5554         mutex_unlock(&data_alloc_inode->i_mutex);
5555         iput(data_alloc_inode);
5556
5557 out:
5558         mlog_exit(status);
5559         return status;
5560 }
5561
5562 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5563 {
5564         int status;
5565         struct inode *tl_inode = osb->osb_tl_inode;
5566
5567         mutex_lock(&tl_inode->i_mutex);
5568         status = __ocfs2_flush_truncate_log(osb);
5569         mutex_unlock(&tl_inode->i_mutex);
5570
5571         return status;
5572 }
5573
5574 static void ocfs2_truncate_log_worker(struct work_struct *work)
5575 {
5576         int status;
5577         struct ocfs2_super *osb =
5578                 container_of(work, struct ocfs2_super,
5579                              osb_truncate_log_wq.work);
5580
5581         mlog_entry_void();
5582
5583         status = ocfs2_flush_truncate_log(osb);
5584         if (status < 0)
5585                 mlog_errno(status);
5586         else
5587                 ocfs2_init_inode_steal_slot(osb);
5588
5589         mlog_exit(status);
5590 }
5591
5592 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
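/*
 * (Re)arm the delayed truncate log flush. A nonzero 'cancel' cancels any
 * pending work first, which pushes the next flush out by a full
 * OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL.
 */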
5593 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5594                                        int cancel)
5595 {
5596         if (osb->osb_tl_inode) {
5597                 /* We want to push off log flushes while truncates are
5598                  * still running. */
5599                 if (cancel)
5600                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5601
5602                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5603                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5604         }
5605 }
5606
5607 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5608                                        int slot_num,
5609                                        struct inode **tl_inode,
5610                                        struct buffer_head **tl_bh)
5611 {
5612         int status;
5613         struct inode *inode = NULL;
5614         struct buffer_head *bh = NULL;
5615
5616         inode = ocfs2_get_system_file_inode(osb,
5617                                            TRUNCATE_LOG_SYSTEM_INODE,
5618                                            slot_num);
5619         if (!inode) {
5620                 status = -EINVAL;
5621                 mlog(ML_ERROR, "Could not load truncate log inode!\n");
5622                 goto bail;
5623         }
5624
5625         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5626                                   OCFS2_BH_CACHED, inode);
5627         if (status < 0) {
5628                 iput(inode);
5629                 mlog_errno(status);
5630                 goto bail;
5631         }
5632
5633         *tl_inode = inode;
5634         *tl_bh    = bh;
5635 bail:
5636         mlog_exit(status);
5637         return status;
5638 }
5639
5640 /* called during the 1st stage of node recovery. we stamp a clean
5641  * truncate log and pass back a copy for processing later. if the
5642  * truncate log does not require processing, *tl_copy is set to
5643  * NULL. */
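/*
 * A rough sketch of the two-stage flow driven by the recovery code in
 * journal.c (illustration only, not a literal excerpt):
 *
 *	struct ocfs2_dinode *tl_copy = NULL;
 *
 *	ret = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
 *	...
 *	if (tl_copy) {
 *		ret = ocfs2_complete_truncate_log_recovery(osb, tl_copy);
 *		kfree(tl_copy);
 *	}
 */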
5644 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5645                                       int slot_num,
5646                                       struct ocfs2_dinode **tl_copy)
5647 {
5648         int status;
5649         struct inode *tl_inode = NULL;
5650         struct buffer_head *tl_bh = NULL;
5651         struct ocfs2_dinode *di;
5652         struct ocfs2_truncate_log *tl;
5653
5654         *tl_copy = NULL;
5655
5656         mlog(0, "recover truncate log from slot %d\n", slot_num);
5657
5658         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5659         if (status < 0) {
5660                 mlog_errno(status);
5661                 goto bail;
5662         }
5663
5664         di = (struct ocfs2_dinode *) tl_bh->b_data;
5665         tl = &di->id2.i_dealloc;
5666         if (!OCFS2_IS_VALID_DINODE(di)) {
5667                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5668                 status = -EIO;
5669                 goto bail;
5670         }
5671
5672         if (le16_to_cpu(tl->tl_used)) {
5673                 mlog(0, "We'll have %u logs to recover\n",
5674                      le16_to_cpu(tl->tl_used));
5675
5676                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5677                 if (!(*tl_copy)) {
5678                         status = -ENOMEM;
5679                         mlog_errno(status);
5680                         goto bail;
5681                 }
5682
5683                 /* Assuming the write-out below goes well, this copy
5684                  * will be passed back to recovery for processing. */
5685                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5686
5687                 /* All we need to do to clear the truncate log is set
5688                  * tl_used to zero. */
5689                 tl->tl_used = 0;
5690
5691                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5692                 if (status < 0) {
5693                         mlog_errno(status);
5694                         goto bail;
5695                 }
5696         }
5697
5698 bail:
5699         if (tl_inode)
5700                 iput(tl_inode);
5701         if (tl_bh)
5702                 brelse(tl_bh);
5703
5704         if (status < 0 && (*tl_copy)) {
5705                 kfree(*tl_copy);
5706                 *tl_copy = NULL;
5707         }
5708
5709         mlog_exit(status);
5710         return status;
5711 }
5712
5713 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5714                                          struct ocfs2_dinode *tl_copy)
5715 {
5716         int status = 0;
5717         int i;
5718         unsigned int clusters, num_recs, start_cluster;
5719         u64 start_blk;
5720         handle_t *handle;
5721         struct inode *tl_inode = osb->osb_tl_inode;
5722         struct ocfs2_truncate_log *tl;
5723
5724         mlog_entry_void();
5725
5726         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5727                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5728                 return -EINVAL;
5729         }
5730
5731         tl = &tl_copy->id2.i_dealloc;
5732         num_recs = le16_to_cpu(tl->tl_used);
5733         mlog(0, "cleanup %u records from %llu\n", num_recs,
5734              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5735
5736         mutex_lock(&tl_inode->i_mutex);
5737         for(i = 0; i < num_recs; i++) {
5738                 if (ocfs2_truncate_log_needs_flush(osb)) {
5739                         status = __ocfs2_flush_truncate_log(osb);
5740                         if (status < 0) {
5741                                 mlog_errno(status);
5742                                 goto bail_up;
5743                         }
5744                 }
5745
5746                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5747                 if (IS_ERR(handle)) {
5748                         status = PTR_ERR(handle);
5749                         mlog_errno(status);
5750                         goto bail_up;
5751                 }
5752
5753                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5754                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5755                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5756
5757                 status = ocfs2_truncate_log_append(osb, handle,
5758                                                    start_blk, clusters);
5759                 ocfs2_commit_trans(osb, handle);
5760                 if (status < 0) {
5761                         mlog_errno(status);
5762                         goto bail_up;
5763                 }
5764         }
5765
5766 bail_up:
5767         mutex_unlock(&tl_inode->i_mutex);
5768
5769         mlog_exit(status);
5770         return status;
5771 }
5772
5773 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5774 {
5775         int status;
5776         struct inode *tl_inode = osb->osb_tl_inode;
5777
5778         mlog_entry_void();
5779
5780         if (tl_inode) {
5781                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5782                 flush_workqueue(ocfs2_wq);
5783
5784                 status = ocfs2_flush_truncate_log(osb);
5785                 if (status < 0)
5786                         mlog_errno(status);
5787
5788                 brelse(osb->osb_tl_bh);
5789                 iput(osb->osb_tl_inode);
5790         }
5791
5792         mlog_exit_void();
5793 }
5794
5795 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5796 {
5797         int status;
5798         struct inode *tl_inode = NULL;
5799         struct buffer_head *tl_bh = NULL;
5800
5801         mlog_entry_void();
5802
5803         status = ocfs2_get_truncate_log_info(osb,
5804                                              osb->slot_num,
5805                                              &tl_inode,
5806                                              &tl_bh);
5807         if (status < 0)
5808                 mlog_errno(status);
5809
5810         /* ocfs2_truncate_log_shutdown keys on the existence of
5811          * osb->osb_tl_inode so we don't set any of the osb variables
5812          * until we're sure all is well. */
5813         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5814                           ocfs2_truncate_log_worker);
5815         osb->osb_tl_bh    = tl_bh;
5816         osb->osb_tl_inode = tl_inode;
5817
5818         mlog_exit(status);
5819         return status;
5820 }
5821
5822 /*
5823  * Delayed de-allocation of suballocator blocks.
5824  *
5825  * Some sets of block de-allocations might involve multiple suballocator inodes.
5826  *
5827  * The locking for this can get extremely complicated, especially when
5828  * the suballocator inodes to delete from aren't known until deep
5829  * within an unrelated codepath.
5830  *
5831  * ocfs2_extent_block structures are a good example of this - an inode
5832  * btree could have been grown by any number of nodes each allocating
5833  * out of their own suballoc inode.
5834  *
5835  * These structures allow the delay of block de-allocation until a
5836  * later time, when locking of multiple cluster inodes won't cause
5837  * deadlock.
5838  */
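/*
 * A rough sketch of how this file uses the mechanism (illustration only):
 * ocfs2_trim_tree() queues each freed extent block on tc->tc_dealloc via
 * ocfs2_cache_extent_block_free(), and ocfs2_commit_truncate() drains the
 * context with ocfs2_run_deallocs() once it is safe to take the
 * suballocator locks:
 *
 *	ocfs2_init_dealloc_ctxt(&tc->tc_dealloc);
 *	...
 *	ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
 *	...
 *	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
 */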
5839
5840 /*
5841  * Describes a single block free from a suballocator
5842  */
5843 struct ocfs2_cached_block_free {
5844         struct ocfs2_cached_block_free          *free_next;
5845         u64                                     free_blk;
5846         unsigned int                            free_bit;
5847 };
5848
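/*
 * All pending frees against a single suballocator inode, identified by
 * system file type and slot. These lists are chained off of an
 * ocfs2_cached_dealloc_ctxt and drained by ocfs2_run_deallocs().
 */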
5849 struct ocfs2_per_slot_free_list {
5850         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5851         int                                     f_inode_type;
5852         int                                     f_slot;
5853         struct ocfs2_cached_block_free          *f_first;
5854 };
5855
5856 static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5857                                    int sysfile_type,
5858                                    int slot,
5859                                    struct ocfs2_cached_block_free *head)
5860 {
5861         int ret;
5862         u64 bg_blkno;
5863         handle_t *handle;
5864         struct inode *inode;
5865         struct buffer_head *di_bh = NULL;
5866         struct ocfs2_cached_block_free *tmp;
5867
5868         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5869         if (!inode) {
5870                 ret = -EINVAL;
5871                 mlog_errno(ret);
5872                 goto out;
5873         }
5874
5875         mutex_lock(&inode->i_mutex);
5876
5877         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5878         if (ret) {
5879                 mlog_errno(ret);
5880                 goto out_mutex;
5881         }
5882
5883         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5884         if (IS_ERR(handle)) {
5885                 ret = PTR_ERR(handle);
5886                 mlog_errno(ret);
5887                 goto out_unlock;
5888         }
5889
5890         while (head) {
5891                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5892                                                       head->free_bit);
5893                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5894                      head->free_bit, (unsigned long long)head->free_blk);
5895
5896                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5897                                                head->free_bit, bg_blkno, 1);
5898                 if (ret) {
5899                         mlog_errno(ret);
5900                         goto out_journal;
5901                 }
5902
5903                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5904                 if (ret) {
5905                         mlog_errno(ret);
5906                         goto out_journal;
5907                 }
5908
5909                 tmp = head;
5910                 head = head->free_next;
5911                 kfree(tmp);
5912         }
5913
5914 out_journal:
5915         ocfs2_commit_trans(osb, handle);
5916
5917 out_unlock:
5918         ocfs2_inode_unlock(inode, 1);
5919         brelse(di_bh);
5920 out_mutex:
5921         mutex_unlock(&inode->i_mutex);
5922         iput(inode);
5923 out:
5924         while(head) {
5925                 /* Premature exit may have left some dangling items. */
5926                 tmp = head;
5927                 head = head->free_next;
5928                 kfree(tmp);
5929         }
5930
5931         return ret;
5932 }
5933
5934 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5935                        struct ocfs2_cached_dealloc_ctxt *ctxt)
5936 {
5937         int ret = 0, ret2;
5938         struct ocfs2_per_slot_free_list *fl;
5939
5940         if (!ctxt)
5941                 return 0;
5942
5943         while (ctxt->c_first_suballocator) {
5944                 fl = ctxt->c_first_suballocator;
5945
5946                 if (fl->f_first) {
5947                         mlog(0, "Free items: (type %u, slot %d)\n",
5948                              fl->f_inode_type, fl->f_slot);
5949                         ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5950                                                        fl->f_slot, fl->f_first);
5951                         if (ret2)
5952                                 mlog_errno(ret2);
5953                         if (!ret)
5954                                 ret = ret2;
5955                 }
5956
5957                 ctxt->c_first_suballocator = fl->f_next_suballocator;
5958                 kfree(fl);
5959         }
5960
5961         return ret;
5962 }
5963
5964 static struct ocfs2_per_slot_free_list *
5965 ocfs2_find_per_slot_free_list(int type,
5966                               int slot,
5967                               struct ocfs2_cached_dealloc_ctxt *ctxt)
5968 {
5969         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5970
5971         while (fl) {
5972                 if (fl->f_inode_type == type && fl->f_slot == slot)
5973                         return fl;
5974
5975                 fl = fl->f_next_suballocator;
5976         }
5977
5978         fl = kmalloc(sizeof(*fl), GFP_NOFS);
5979         if (fl) {
5980                 fl->f_inode_type = type;
5981                 fl->f_slot = slot;
5982                 fl->f_first = NULL;
5983                 fl->f_next_suballocator = ctxt->c_first_suballocator;
5984
5985                 ctxt->c_first_suballocator = fl;
5986         }
5987         return fl;
5988 }
5989
5990 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5991                                      int type, int slot, u64 blkno,
5992                                      unsigned int bit)
5993 {
5994         int ret;
5995         struct ocfs2_per_slot_free_list *fl;
5996         struct ocfs2_cached_block_free *item;
5997
5998         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5999         if (fl == NULL) {
6000                 ret = -ENOMEM;
6001                 mlog_errno(ret);
6002                 goto out;
6003         }
6004
6005         item = kmalloc(sizeof(*item), GFP_NOFS);
6006         if (item == NULL) {
6007                 ret = -ENOMEM;
6008                 mlog_errno(ret);
6009                 goto out;
6010         }
6011
6012         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6013              type, slot, bit, (unsigned long long)blkno);
6014
6015         item->free_blk = blkno;
6016         item->free_bit = bit;
6017         item->free_next = fl->f_first;
6018
6019         fl->f_first = item;
6020
6021         ret = 0;
6022 out:
6023         return ret;
6024 }
6025
6026 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6027                                          struct ocfs2_extent_block *eb)
6028 {
6029         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6030                                          le16_to_cpu(eb->h_suballoc_slot),
6031                                          le64_to_cpu(eb->h_blkno),
6032                                          le16_to_cpu(eb->h_suballoc_bit));
6033 }
6034
6035 /* This function will figure out whether the currently last extent
6036  * block will be deleted, and if it will, what the new last extent
6037  * block will be so we can update its h_next_leaf_blk field, as well
6038  * as the dinode's i_last_eb_blk. */
6039 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6040                                        unsigned int clusters_to_del,
6041                                        struct ocfs2_path *path,
6042                                        struct buffer_head **new_last_eb)
6043 {
6044         int next_free, ret = 0;
6045         u32 cpos;
6046         struct ocfs2_extent_rec *rec;
6047         struct ocfs2_extent_block *eb;
6048         struct ocfs2_extent_list *el;
6049         struct buffer_head *bh = NULL;
6050
6051         *new_last_eb = NULL;
6052
6053         /* we have no tree, so of course, no last_eb. */
6054         if (!path->p_tree_depth)
6055                 goto out;
6056
6057         /* trunc to zero special case - this makes tree_depth = 0
6058          * regardless of what it is.  */
6059         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6060                 goto out;
6061
6062         el = path_leaf_el(path);
6063         BUG_ON(!el->l_next_free_rec);
6064
6065         /*
6066          * Make sure that this extent list will actually be empty
6067          * after we clear away the data. We can shortcut out if
6068          * there's more than one non-empty extent in the
6069          * list. Otherwise, a check of the remaining extent is
6070          * necessary.
6071          */
6072         next_free = le16_to_cpu(el->l_next_free_rec);
6073         rec = NULL;
6074         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6075                 if (next_free > 2)
6076                         goto out;
6077
6078                 /* We may have a valid extent in index 1, check it. */
6079                 if (next_free == 2)
6080                         rec = &el->l_recs[1];
6081
6082                 /*
6083                  * Fall through - no more nonempty extents, so we want
6084                  * to delete this leaf.
6085                  */
6086         } else {
6087                 if (next_free > 1)
6088                         goto out;
6089
6090                 rec = &el->l_recs[0];
6091         }
6092
6093         if (rec) {
6094                 /*
6095                  * Check if we'll only be trimming off the end of this
6096                  * cluster.
6097                  */
6098                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6099                         goto out;
6100         }
6101
6102         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6103         if (ret) {
6104                 mlog_errno(ret);
6105                 goto out;
6106         }
6107
6108         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6109         if (ret) {
6110                 mlog_errno(ret);
6111                 goto out;
6112         }
6113
6114         eb = (struct ocfs2_extent_block *) bh->b_data;
6115         el = &eb->h_list;
6116         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6117                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6118                 ret = -EROFS;
6119                 goto out;
6120         }
6121
6122         *new_last_eb = bh;
6123         get_bh(*new_last_eb);
6124         mlog(0, "returning block %llu, (cpos: %u)\n",
6125              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6126 out:
6127         brelse(bh);
6128
6129         return ret;
6130 }
6131
6132 /*
6133  * Trim some clusters off the rightmost edge of a tree. Only called
6134  * during truncate.
6135  *
6136  * The caller needs to:
6137  *   - start journaling of each path component.
6138  *   - compute and fully set up any new last ext block
6139  */
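/*
 * Within this file, ocfs2_do_truncate() below is the caller: it journals
 * the path via ocfs2_journal_access_path() and sets up any new last
 * extent block with ocfs2_find_new_last_ext_blk() before calling in here.
 */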
6140 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6141                            handle_t *handle, struct ocfs2_truncate_context *tc,
6142                            u32 clusters_to_del, u64 *delete_start)
6143 {
6144         int ret, i, index = path->p_tree_depth;
6145         u32 new_edge = 0;
6146         u64 deleted_eb = 0;
6147         struct buffer_head *bh;
6148         struct ocfs2_extent_list *el;
6149         struct ocfs2_extent_rec *rec;
6150
6151         *delete_start = 0;
6152
6153         while (index >= 0) {
6154                 bh = path->p_node[index].bh;
6155                 el = path->p_node[index].el;
6156
6157                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6158                      index,  (unsigned long long)bh->b_blocknr);
6159
6160                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6161
6162                 if (index !=
6163                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6164                         ocfs2_error(inode->i_sb,
6165                                     "Inode %lu has invalid ext. block %llu",
6166                                     inode->i_ino,
6167                                     (unsigned long long)bh->b_blocknr);
6168                         ret = -EROFS;
6169                         goto out;
6170                 }
6171
6172 find_tail_record:
6173                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6174                 rec = &el->l_recs[i];
6175
6176                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6177                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6178                      ocfs2_rec_clusters(el, rec),
6179                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6180                      le16_to_cpu(el->l_next_free_rec));
6181
6182                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6183
6184                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6185                         /*
6186                          * If the leaf block contains nothing but a
6187                          * single empty extent record, we can just
6188                          * remove the block.
6189                          */
6190                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6191                                 memset(rec, 0,
6192                                        sizeof(struct ocfs2_extent_rec));
6193                                 el->l_next_free_rec = cpu_to_le16(0);
6194
6195                                 goto delete;
6196                         }
6197
6198                         /*
6199                          * Remove any empty extents by shifting things
6200                          * left. That should make life much easier on
6201                          * the code below. This condition is rare
6202                          * enough that we shouldn't see a performance
6203                          * hit.
6204                          */
6205                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6206                                 le16_add_cpu(&el->l_next_free_rec, -1);
6207
6208                                 for(i = 0;
6209                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6210                                         el->l_recs[i] = el->l_recs[i + 1];
6211
6212                                 memset(&el->l_recs[i], 0,
6213                                        sizeof(struct ocfs2_extent_rec));
6214
6215                                 /*
6216                                  * We've modified our extent list. The
6217                                  * simplest way to handle this change
6218                                  * is to begin the search from the
6219                                  * start again.
6220                                  */
6221                                 goto find_tail_record;
6222                         }
6223
6224                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6225
6226                         /*
6227                          * We'll use "new_edge" on our way back up the
6228                          * tree to know what our rightmost cpos is.
6229                          */
6230                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6231                         new_edge += le32_to_cpu(rec->e_cpos);
6232
6233                         /*
6234                          * The caller will use this to delete data blocks.
6235                          */
6236                         *delete_start = le64_to_cpu(rec->e_blkno)
6237                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6238                                         le16_to_cpu(rec->e_leaf_clusters));
6239
6240                         /*
6241                          * If it's now empty, remove this record.
6242                          */
6243                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6244                                 memset(rec, 0,
6245                                        sizeof(struct ocfs2_extent_rec));
6246                                 le16_add_cpu(&el->l_next_free_rec, -1);
6247                         }
6248                 } else {
6249                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6250                                 memset(rec, 0,
6251                                        sizeof(struct ocfs2_extent_rec));
6252                                 le16_add_cpu(&el->l_next_free_rec, -1);
6253
6254                                 goto delete;
6255                         }
6256
6257                         /* Can this actually happen? */
6258                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6259                                 goto delete;
6260
6261                         /*
6262                          * We never actually deleted any clusters
6263                          * because our leaf was empty. There's no
6264                          * reason to adjust the rightmost edge then.
6265                          */
6266                         if (new_edge == 0)
6267                                 goto delete;
6268
6269                         rec->e_int_clusters = cpu_to_le32(new_edge);
6270                         le32_add_cpu(&rec->e_int_clusters,
6271                                      -le32_to_cpu(rec->e_cpos));
6272
6273                         /*
6274                          * A deleted child record should have been
6275                          * caught above.
6276                          */
6277                         BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6278                 }
6279
6280 delete:
6281                 ret = ocfs2_journal_dirty(handle, bh);
6282                 if (ret) {
6283                         mlog_errno(ret);
6284                         goto out;
6285                 }
6286
6287                 mlog(0, "extent list container %llu, after: record %d: "
6288                      "(%u, %u, %llu), next = %u.\n",
6289                      (unsigned long long)bh->b_blocknr, i,
6290                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6291                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6292                      le16_to_cpu(el->l_next_free_rec));
6293
6294                 /*
6295                  * We must be careful to only attempt delete of an
6296                  * extent block (and not the root inode block).
6297                  */
6298                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6299                         struct ocfs2_extent_block *eb =
6300                                 (struct ocfs2_extent_block *)bh->b_data;
6301
6302                         /*
6303                          * Save this for use when processing the
6304                          * parent block.
6305                          */
6306                         deleted_eb = le64_to_cpu(eb->h_blkno);
6307
6308                         mlog(0, "deleting this extent block.\n");
6309
6310                         ocfs2_remove_from_cache(inode, bh);
6311
6312                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6313                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6314                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6315
6316                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6317                         /* An error here is not fatal. */
6318                         if (ret < 0)
6319                                 mlog_errno(ret);
6320                 } else {
6321                         deleted_eb = 0;
6322                 }
6323
6324                 index--;
6325         }
6326
6327         ret = 0;
6328 out:
6329         return ret;
6330 }
6331
6332 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6333                              unsigned int clusters_to_del,
6334                              struct inode *inode,
6335                              struct buffer_head *fe_bh,
6336                              handle_t *handle,
6337                              struct ocfs2_truncate_context *tc,
6338                              struct ocfs2_path *path)
6339 {
6340         int status;
6341         struct ocfs2_dinode *fe;
6342         struct ocfs2_extent_block *last_eb = NULL;
6343         struct ocfs2_extent_list *el;
6344         struct buffer_head *last_eb_bh = NULL;
6345         u64 delete_blk = 0;
6346
6347         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6348
6349         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6350                                              path, &last_eb_bh);
6351         if (status < 0) {
6352                 mlog_errno(status);
6353                 goto bail;
6354         }
6355
6356         /*
6357          * Each component will be touched, so we might as well journal
6358          * here to avoid having to handle errors later.
6359          */
6360         status = ocfs2_journal_access_path(inode, handle, path);
6361         if (status < 0) {
6362                 mlog_errno(status);
6363                 goto bail;
6364         }
6365
6366         if (last_eb_bh) {
6367                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6368                                               OCFS2_JOURNAL_ACCESS_WRITE);
6369                 if (status < 0) {
6370                         mlog_errno(status);
6371                         goto bail;
6372                 }
6373
6374                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6375         }
6376
6377         el = &(fe->id2.i_list);
6378
6379         /*
6380          * Lower levels depend on this never happening, but it's best
6381          * to check it up here before changing the tree.
6382          */
6383         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6384                 ocfs2_error(inode->i_sb,
6385                             "Inode %lu has an empty extent record, depth %u\n",
6386                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6387                 status = -EROFS;
6388                 goto bail;
6389         }
6390
6391         spin_lock(&OCFS2_I(inode)->ip_lock);
6392         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6393                                       clusters_to_del;
6394         spin_unlock(&OCFS2_I(inode)->ip_lock);
6395         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6396         inode->i_blocks = ocfs2_inode_sector_count(inode);
6397
6398         status = ocfs2_trim_tree(inode, path, handle, tc,
6399                                  clusters_to_del, &delete_blk);
6400         if (status) {
6401                 mlog_errno(status);
6402                 goto bail;
6403         }
6404
6405         if (le32_to_cpu(fe->i_clusters) == 0) {
6406                 /* trunc to zero is a special case. */
6407                 el->l_tree_depth = 0;
6408                 fe->i_last_eb_blk = 0;
6409         } else if (last_eb)
6410                 fe->i_last_eb_blk = last_eb->h_blkno;
6411
6412         status = ocfs2_journal_dirty(handle, fe_bh);
6413         if (status < 0) {
6414                 mlog_errno(status);
6415                 goto bail;
6416         }
6417
6418         if (last_eb) {
6419                 /* If there will be a new last extent block, then by
6420                  * definition, there cannot be any leaves to the right of
6421                  * it. */
6422                 last_eb->h_next_leaf_blk = 0;
6423                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6424                 if (status < 0) {
6425                         mlog_errno(status);
6426                         goto bail;
6427                 }
6428         }
6429
6430         if (delete_blk) {
6431                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6432                                                    clusters_to_del);
6433                 if (status < 0) {
6434                         mlog_errno(status);
6435                         goto bail;
6436                 }
6437         }
6438         status = 0;
6439 bail:
6440
6441         mlog_exit(status);
6442         return status;
6443 }
6444
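/*
 * walk_page_buffers() callbacks used by ocfs2_map_and_dirty_page():
 * both mark the buffer uptodate and dirty; the ordered variant also
 * registers the buffer with the journal as ordered data.
 */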
6445 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
6446 {
6447         set_buffer_uptodate(bh);
6448         mark_buffer_dirty(bh);
6449         return 0;
6450 }
6451
6452 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6453 {
6454         set_buffer_uptodate(bh);
6455         mark_buffer_dirty(bh);
6456         return ocfs2_journal_dirty_data(handle, bh);
6457 }
6458
6459 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6460                                      unsigned int from, unsigned int to,
6461                                      struct page *page, int zero, u64 *phys)
6462 {
6463         int ret, partial = 0;
6464
6465         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6466         if (ret)
6467                 mlog_errno(ret);
6468
6469         if (zero)
6470                 zero_user_segment(page, from, to);
6471
6472         /*
6473          * Need to set the buffers we zero'd into uptodate
6474          * here if they aren't - ocfs2_map_page_blocks()
6475          * might've skipped some
6476          */
6477         if (ocfs2_should_order_data(inode)) {
6478                 ret = walk_page_buffers(handle,
6479                                         page_buffers(page),
6480                                         from, to, &partial,
6481                                         ocfs2_ordered_zero_func);
6482                 if (ret < 0)
6483                         mlog_errno(ret);
6484         } else {
6485                 ret = walk_page_buffers(handle, page_buffers(page),
6486                                         from, to, &partial,
6487                                         ocfs2_writeback_zero_func);
6488                 if (ret < 0)
6489                         mlog_errno(ret);
6490         }
6491
6492         if (!partial)
6493                 SetPageUptodate(page);
6494
6495         flush_dcache_page(page);
6496 }
6497
6498 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6499                                      loff_t end, struct page **pages,
6500                                      int numpages, u64 phys, handle_t *handle)
6501 {
6502         int i;
6503         struct page *page;
6504         unsigned int from, to = PAGE_CACHE_SIZE;
6505         struct super_block *sb = inode->i_sb;
6506
6507         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6508
6509         if (numpages == 0)
6510                 goto out;
6511
6512         to = PAGE_CACHE_SIZE;
6513         for(i = 0; i < numpages; i++) {
6514                 page = pages[i];
6515
6516                 from = start & (PAGE_CACHE_SIZE - 1);
6517                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6518                         to = end & (PAGE_CACHE_SIZE - 1);
6519
6520                 BUG_ON(from > PAGE_CACHE_SIZE);
6521                 BUG_ON(to > PAGE_CACHE_SIZE);
6522
6523                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6524                                          &phys);
6525
6526                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6527         }
6528 out:
6529         if (pages)
6530                 ocfs2_unlock_and_free_pages(pages, numpages);
6531 }
6532
6533 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6534                                 struct page **pages, int *num)
6535 {
6536         int numpages, ret = 0;
6537         struct super_block *sb = inode->i_sb;
6538         struct address_space *mapping = inode->i_mapping;
6539         unsigned long index;
6540         loff_t last_page_bytes;
6541
6542         BUG_ON(start > end);
6543
6544         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6545                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6546
6547         numpages = 0;
6548         last_page_bytes = PAGE_ALIGN(end);
6549         index = start >> PAGE_CACHE_SHIFT;
6550         do {
6551                 pages[numpages] = grab_cache_page(mapping, index);
6552                 if (!pages[numpages]) {
6553                         ret = -ENOMEM;
6554                         mlog_errno(ret);
6555                         goto out;
6556                 }
6557
6558                 numpages++;
6559                 index++;
6560         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6561
6562 out:
6563         if (ret != 0) {
6564                 if (pages)
6565                         ocfs2_unlock_and_free_pages(pages, numpages);
6566                 numpages = 0;
6567         }
6568
6569         *num = numpages;
6570
6571         return ret;
6572 }
6573
6574 /*
6575  * Zero the area past i_size but still within an allocated
6576  * cluster. This avoids exposing nonzero data on subsequent file
6577  * extends.
6578  *
6579  * We need to call this before i_size is updated on the inode because
6580  * otherwise block_write_full_page() will skip writeout of pages past
6581  * i_size. The range to zero is passed in explicitly for this reason.
6582  */
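/*
 * Note that ocfs2_grab_eof_pages() below insists (via BUG_ON) that
 * range_start and range_end - 1 fall within the same cluster, so the
 * caller must keep the range inside one allocated cluster.
 */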
6583 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6584                                   u64 range_start, u64 range_end)
6585 {
6586         int ret = 0, numpages;
6587         struct page **pages = NULL;
6588         u64 phys;
6589         unsigned int ext_flags;
6590         struct super_block *sb = inode->i_sb;
6591
6592         /*
6593          * File systems which don't support sparse files zero on every
6594          * extend.
6595          */
6596         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6597                 return 0;
6598
6599         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6600                         sizeof(struct page *), GFP_NOFS);
6601         if (pages == NULL) {
6602                 ret = -ENOMEM;
6603                 mlog_errno(ret);
6604                 goto out;
6605         }
6606
6607         if (range_start == range_end)
6608                 goto out;
6609
6610         ret = ocfs2_extent_map_get_blocks(inode,
6611                                           range_start >> sb->s_blocksize_bits,
6612                                           &phys, NULL, &ext_flags);
6613         if (ret) {
6614                 mlog_errno(ret);
6615                 goto out;
6616         }
6617
6618         /*
6619          * Tail is a hole, or is marked unwritten. In either case, we
6620          * can count on read and write to return/push zero's.
6621          */
6622         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6623                 goto out;
6624
6625         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6626                                    &numpages);
6627         if (ret) {
6628                 mlog_errno(ret);
6629                 goto out;
6630         }
6631
6632         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6633                                  numpages, phys, handle);
6634
6635         /*
6636          * Initiate writeout of the pages we zero'd here. We don't
6637          * wait on them - the truncate_inode_pages() call later will
6638          * do that for us.
6639          */
6640         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6641                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6642         if (ret)
6643                 mlog_errno(ret);
6644
6645 out:
6646         if (pages)
6647                 kfree(pages);
6648
6649         return ret;
6650 }
6651
6652 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6653                                              struct ocfs2_dinode *di)
6654 {
6655         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6656         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6657
6658         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6659                 memset(&di->id2, 0, blocksize -
6660                                     offsetof(struct ocfs2_dinode, id2) -
6661                                     xattrsize);
6662         else
6663                 memset(&di->id2, 0, blocksize -
6664                                     offsetof(struct ocfs2_dinode, id2));
6665 }
6666
6667 void ocfs2_dinode_new_extent_list(struct inode *inode,
6668                                   struct ocfs2_dinode *di)
6669 {
6670         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6671         di->id2.i_list.l_tree_depth = 0;
6672         di->id2.i_list.l_next_free_rec = 0;
6673         di->id2.i_list.l_count = cpu_to_le16(
6674                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6675 }
6676
6677 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6678 {
6679         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6680         struct ocfs2_inline_data *idata = &di->id2.i_data;
6681
6682         spin_lock(&oi->ip_lock);
6683         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6684         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6685         spin_unlock(&oi->ip_lock);
6686
6687         /*
6688          * We clear the entire i_data structure here so that all
6689          * fields can be properly initialized.
6690          */
6691         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6692
6693         idata->id_count = cpu_to_le16(
6694                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6695 }
6696
6697 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6698                                          struct buffer_head *di_bh)
6699 {
6700         int ret, i, has_data, num_pages = 0;
6701         handle_t *handle;
6702         u64 uninitialized_var(block);
6703         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6704         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6705         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6706         struct ocfs2_alloc_context *data_ac = NULL;
6707         struct page **pages = NULL;
6708         loff_t end = osb->s_clustersize;
6709
6710         has_data = i_size_read(inode) ? 1 : 0;
6711
6712         if (has_data) {
6713                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6714                                 sizeof(struct page *), GFP_NOFS);
6715                 if (pages == NULL) {
6716                         ret = -ENOMEM;
6717                         mlog_errno(ret);
6718                         goto out;
6719                 }
6720
6721                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6722                 if (ret) {
6723                         mlog_errno(ret);
6724                         goto out;
6725                 }
6726         }
6727
6728         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6729         if (IS_ERR(handle)) {
6730                 ret = PTR_ERR(handle);
6731                 mlog_errno(ret);
6732                 goto out_unlock;
6733         }
6734
6735         ret = ocfs2_journal_access(handle, inode, di_bh,
6736                                    OCFS2_JOURNAL_ACCESS_WRITE);
6737         if (ret) {
6738                 mlog_errno(ret);
6739                 goto out_commit;
6740         }
6741
6742         if (has_data) {
6743                 u32 bit_off, num;
6744                 unsigned int page_end;
6745                 u64 phys;
6746
6747                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6748                                            &num);
6749                 if (ret) {
6750                         mlog_errno(ret);
6751                         goto out_commit;
6752                 }
6753
6754                 /*
6755                  * Save two copies, one for insert, and one that can
6756                  * be changed by ocfs2_map_and_dirty_page() below.
6757                  */
6758                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6759
6760                 /*
6761                  * Non sparse file systems zero on extend, so no need
6762                  * to do that now.
6763                  */
6764                 if (!ocfs2_sparse_alloc(osb) &&
6765                     PAGE_CACHE_SIZE < osb->s_clustersize)
6766                         end = PAGE_CACHE_SIZE;
6767
6768                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6769                 if (ret) {
6770                         mlog_errno(ret);
6771                         goto out_commit;
6772                 }
6773
6774                 /*
6775                  * This should populate the 1st page for us and mark
6776                  * it up to date.
6777                  */
6778                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6779                 if (ret) {
6780                         mlog_errno(ret);
6781                         goto out_commit;
6782                 }
6783
6784                 page_end = PAGE_CACHE_SIZE;
6785                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6786                         page_end = osb->s_clustersize;
6787
6788                 for (i = 0; i < num_pages; i++)
6789                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6790                                                  pages[i], i > 0, &phys);
6791         }
6792
6793         spin_lock(&oi->ip_lock);
6794         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6795         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6796         spin_unlock(&oi->ip_lock);
6797
6798         ocfs2_dinode_new_extent_list(inode, di);
6799
6800         ocfs2_journal_dirty(handle, di_bh);
6801
6802         if (has_data) {
6803                 /*
6804                  * An error at this point should be extremely rare. If
6805                  * this proves to be false, we could always re-build
6806                  * the in-inode data from our pages.
6807                  */
6808                 ret = ocfs2_dinode_insert_extent(osb, handle, inode, di_bh,
6809                                                  0, block, 1, 0, NULL);
6810                 if (ret) {
6811                         mlog_errno(ret);
6812                         goto out_commit;
6813                 }
6814
6815                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6816         }
6817
6818 out_commit:
6819         ocfs2_commit_trans(osb, handle);
6820
6821 out_unlock:
6822         if (data_ac)
6823                 ocfs2_free_alloc_context(data_ac);
6824
6825 out:
6826         if (pages) {
6827                 ocfs2_unlock_and_free_pages(pages, num_pages);
6828                 kfree(pages);
6829         }
6830
6831         return ret;
6832 }
6833
6834 /*
6835  * It is expected that, by the time you call this function,
6836  * inode->i_size and fe->i_size have been adjusted.
6837  *
6838  * WARNING: This will kfree the truncate context
6839  */
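/*
 * A sketch of the expected pairing with ocfs2_prepare_truncate() below;
 * the real caller lives in the truncate path outside this file
 * (illustration only):
 *
 *	struct ocfs2_truncate_context *tc = NULL;
 *
 *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 *	...
 *	(adjust inode->i_size and fe->i_size to the new size)
 *	...
 *	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
 *	(tc has now been freed, do not touch it again)
 */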
6840 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6841                           struct inode *inode,
6842                           struct buffer_head *fe_bh,
6843                           struct ocfs2_truncate_context *tc)
6844 {
6845         int status, i, credits, tl_sem = 0;
6846         u32 clusters_to_del, new_highest_cpos, range;
6847         struct ocfs2_extent_list *el;
6848         handle_t *handle = NULL;
6849         struct inode *tl_inode = osb->osb_tl_inode;
6850         struct ocfs2_path *path = NULL;
6851         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6852
6853         mlog_entry_void();
6854
6855         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6856                                                      i_size_read(inode));
6857
6858         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6859         if (!path) {
6860                 status = -ENOMEM;
6861                 mlog_errno(status);
6862                 goto bail;
6863         }
6864
6865         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6866
6867 start:
6868         /*
6869          * Check that we still have allocation to delete.
6870          */
6871         if (OCFS2_I(inode)->ip_clusters == 0) {
6872                 status = 0;
6873                 goto bail;
6874         }
6875
6876         /*
6877          * Truncate always works against the rightmost tree branch.
6878          */
6879         status = ocfs2_find_path(inode, path, UINT_MAX);
6880         if (status) {
6881                 mlog_errno(status);
6882                 goto bail;
6883         }
6884
6885         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6886              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6887
6888         /*
6889          * By now, el will point to the extent list on the bottom most
6890          * portion of this tree. Only the tail record is considered in
6891          * each pass.
6892          *
6893          * We handle the following cases, in order:
6894          * - empty extent: delete the remaining branch
6895          * - remove the entire record
6896          * - remove a partial record
6897          * - no record needs to be removed (truncate has completed)
6898          */
6899         el = path_leaf_el(path);
6900         if (le16_to_cpu(el->l_next_free_rec) == 0) {
6901                 ocfs2_error(inode->i_sb,
6902                             "Inode %llu has empty extent block at %llu\n",
6903                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6904                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
6905                 status = -EROFS;
6906                 goto bail;
6907         }
6908
6909         i = le16_to_cpu(el->l_next_free_rec) - 1;
6910         range = le32_to_cpu(el->l_recs[i].e_cpos) +
6911                 ocfs2_rec_clusters(el, &el->l_recs[i]);
6912         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6913                 clusters_to_del = 0;
6914         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6915                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6916         } else if (range > new_highest_cpos) {
6917                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6918                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
6919                                   new_highest_cpos;
6920         } else {
6921                 status = 0;
6922                 goto bail;
6923         }
6924
6925         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6926              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6927
6928         mutex_lock(&tl_inode->i_mutex);
6929         tl_sem = 1;
6930         /* Make sure at least one truncate log record is free for the
6931          * append below: if ocfs2_truncate_log_needs_flush() says the log
6932          * is full, flush it to empty it out.  */
6933         if (ocfs2_truncate_log_needs_flush(osb)) {
6934                 status = __ocfs2_flush_truncate_log(osb);
6935                 if (status < 0) {
6936                         mlog_errno(status);
6937                         goto bail;
6938                 }
6939         }
6940
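             /* Journal credits for this pass depend on how much allocation
              * is coming off the tail record and on the tree shape. */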
6941         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6942                                                 (struct ocfs2_dinode *)fe_bh->b_data,
6943                                                 el);
6944         handle = ocfs2_start_trans(osb, credits);
6945         if (IS_ERR(handle)) {
6946                 status = PTR_ERR(handle);
6947                 handle = NULL;
6948                 mlog_errno(status);
6949                 goto bail;
6950         }
6951
6952         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
6953                                    tc, path);
6954         if (status < 0) {
6955                 mlog_errno(status);
6956                 goto bail;
6957         }
6958
6959         mutex_unlock(&tl_inode->i_mutex);
6960         tl_sem = 0;
6961
6962         ocfs2_commit_trans(osb, handle);
6963         handle = NULL;
6964
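             /* Keep the root buffer but forget the rest of the path so the
              * next pass re-reads the (possibly shorter) rightmost branch. */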
6965         ocfs2_reinit_path(path, 1);
6966
6967         /*
6968          * The ip_clusters check at the top of the loop will catch
6969          * the case where we've truncated away all allocation.
6970          */
6971         goto start;
6972
6973 bail:
6974
6975         ocfs2_schedule_truncate_log_flush(osb, 1);
6976
6977         if (tl_sem)
6978                 mutex_unlock(&tl_inode->i_mutex);
6979
6980         if (handle)
6981                 ocfs2_commit_trans(osb, handle);
6982
6983         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6984
6985         ocfs2_free_path(path);
6986
6987         /* This frees tc and the last_eb_bh held by ocfs2_prepare_truncate() */
6988         ocfs2_free_truncate_context(tc);
6989
6990         mlog_exit(status);
6991         return status;
6992 }
6993
6994 /*
6995  * Expects the inode to already be locked.
6996  */
6997 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6998                            struct inode *inode,
6999                            struct buffer_head *fe_bh,
7000                            struct ocfs2_truncate_context **tc)
7001 {
7002         int status;
7003         unsigned int new_i_clusters;
7004         struct ocfs2_dinode *fe;
7005         struct ocfs2_extent_block *eb;
7006         struct buffer_head *last_eb_bh = NULL;
7007
7008         mlog_entry_void();
7009
7010         *tc = NULL;
7011
7012         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7013                                                   i_size_read(inode));
7014         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7015
7016         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
7017              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7018              (unsigned long long)le64_to_cpu(fe->i_size));
7019
7020         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7021         if (!(*tc)) {
7022                 status = -ENOMEM;
7023                 mlog_errno(status);
7024                 goto bail;
7025         }
7026         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7027
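             /* If the tree has depth, read and validate the rightmost extent
              * block (fe->i_last_eb_blk) so it can be pinned in the truncate
              * context. */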
7028         if (fe->id2.i_list.l_tree_depth) {
7029                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
7030                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
7031                 if (status < 0) {
7032                         mlog_errno(status);
7033                         goto bail;
7034                 }
7035                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7036                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
7037                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
7038
7039                         brelse(last_eb_bh);
7040                         status = -EIO;
7041                         goto bail;
7042                 }
7043         }
7044
7045         (*tc)->tc_last_eb_bh = last_eb_bh;
7046
7047         status = 0;
7048 bail:
7049         if (status < 0) {
7050                 if (*tc)
7051                         ocfs2_free_truncate_context(*tc);
7052                 *tc = NULL;
7053         }
7054         mlog_exit_void();
7055         return status;
7056 }
7057
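     /*
      * Illustrative calling sequence (a sketch only, not part of this
      * file): a truncate path is expected to hold the inode lock and
      * have the new i_size in place, then pair ocfs2_prepare_truncate()
      * with ocfs2_commit_truncate().  The names below are hypothetical,
      * and ocfs2_commit_truncate() frees the context on its way out, so
      * the caller must not free it again:
      *
      *	struct ocfs2_truncate_context *tc = NULL;
      *
      *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
      *	if (status < 0)
      *		goto out;
      *
      *	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
      *	if (status < 0)
      *		mlog_errno(status);
      */
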
7058 /*
7059  * 'start' is inclusive, 'end' is not.
7060  */
7061 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7062                           unsigned int start, unsigned int end, int trunc)
7063 {
7064         int ret;
7065         unsigned int numbytes;
7066         handle_t *handle;
7067         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7068         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7069         struct ocfs2_inline_data *idata = &di->id2.i_data;
7070
7071         if (end > i_size_read(inode))
7072                 end = i_size_read(inode);
7073
7074         BUG_ON(start >= end);
7075
7076         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7077             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7078             !ocfs2_supports_inline_data(osb)) {
7079                 ocfs2_error(inode->i_sb,
7080                             "Inline data flags for inode %llu don't agree! "
7081                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7082                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7083                             le16_to_cpu(di->i_dyn_features),
7084                             OCFS2_I(inode)->ip_dyn_features,
7085                             osb->s_feature_incompat);
7086                 ret = -EROFS;
7087                 goto out;
7088         }
7089
7090         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7091         if (IS_ERR(handle)) {
7092                 ret = PTR_ERR(handle);
7093                 mlog_errno(ret);
7094                 goto out;
7095         }
7096
7097         ret = ocfs2_journal_access(handle, inode, di_bh,
7098                                    OCFS2_JOURNAL_ACCESS_WRITE);
7099         if (ret) {
7100                 mlog_errno(ret);
7101                 goto out_commit;
7102         }
7103
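             /* Zero the requested byte range directly in the inode's
              * inline data area. */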
7104         numbytes = end - start;
7105         memset(idata->id_data + start, 0, numbytes);
7106
7107         /*
7108          * No need to worry about the data page here - it's been
7109          * truncated already and inline data doesn't need it for
7110          * pushing zeros to disk, so we'll let readpage pick it up
7111          * later.
7112          */
7113         if (trunc) {
7114                 i_size_write(inode, start);
7115                 di->i_size = cpu_to_le64(start);
7116         }
7117
7118         inode->i_blocks = ocfs2_inode_sector_count(inode);
7119         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7120
7121         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7122         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7123
7124         ocfs2_journal_dirty(handle, di_bh);
7125
7126 out_commit:
7127         ocfs2_commit_trans(osb, handle);
7128
7129 out:
7130         return ret;
7131 }
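
     /*
      * Example (illustrative only): zero bytes [100, 200) of an
      * inline-data file without changing i_size (assumes i_size is
      * larger than 'start' so the BUG_ON above is satisfied):
      *
      *	ret = ocfs2_truncate_inline(inode, di_bh, 100, 200, 0);
      *
      * Passing a non-zero 'trunc' additionally pulls i_size back to
      * 'start', which is what a truncate of the inline data wants.
      */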
7132
7133 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7134 {
7135         /*
7136          * The caller is responsible for completing deallocation
7137          * before freeing the context.
7138          */
7139         if (tc->tc_dealloc.c_first_suballocator != NULL)
7140                 mlog(ML_NOTICE,
7141                      "Truncate completion has non-empty dealloc context\n");
7142
7143         brelse(tc->tc_last_eb_bh);
7145
7146         kfree(tc);
7147 }