ocfs2: Create specific get_extent_tree functions.
fs/ocfs2/alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52 /*
53  * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
54  * the b-tree operations in ocfs2. The b-tree operations are no longer
55  * limited to ocfs2_dinode: any structure that needs to allocate clusters
56  * for its data can use a b-tree, as long as it implements its own
57  * ocfs2_extent_tree and operations.
58  *
59  * ocfs2_extent_tree describes the root of the b-tree. It must carry a
60  * root ocfs2_extent_list and a root_bh so that they can be used by the
61  * b-tree functions.
62  * ocfs2_extent_tree_operations abstracts the operations we perform on
63  * the root of an extent b-tree.
64  */
65 struct ocfs2_extent_tree;
66
67 struct ocfs2_extent_tree_operations {
68         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
69                                    u64 blkno);
70         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
71         void (*eo_update_clusters)(struct inode *inode,
72                                    struct ocfs2_extent_tree *et,
73                                    u32 new_clusters);
74         int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
75
76         /* These are internal to ocfs2_extent_tree and don't have
77          * accessor functions */
78         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
79         void (*eo_fill_max_leaf_clusters)(struct inode *inode,
80                                           struct ocfs2_extent_tree *et);
81 };
82
83 struct ocfs2_extent_tree {
84         enum ocfs2_extent_tree_type             et_type;
85         struct ocfs2_extent_tree_operations     *et_ops;
86         struct buffer_head                      *et_root_bh;
87         struct ocfs2_extent_list                *et_root_el;
88         void                                    *et_object;
89         unsigned int                            et_max_leaf_clusters;
90 };
91
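/*
 * The three implementations that follow (dinode, xattr value, xattr tree)
 * are the current users of this abstraction; a new b-tree user would
 * supply its own ocfs2_extent_tree_operations in the same way.
 */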
92 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
93 {
94         struct ocfs2_dinode *di = et->et_object;
95
96         et->et_root_el = &di->id2.i_list;
97 }
98
99 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
100                                          u64 blkno)
101 {
102         struct ocfs2_dinode *di = et->et_object;
103
104         BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
105         di->i_last_eb_blk = cpu_to_le64(blkno);
106 }
107
108 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
109 {
110         struct ocfs2_dinode *di = et->et_object;
111
112         BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
113         return le64_to_cpu(di->i_last_eb_blk);
114 }
115
116 static void ocfs2_dinode_update_clusters(struct inode *inode,
117                                          struct ocfs2_extent_tree *et,
118                                          u32 clusters)
119 {
120         struct ocfs2_dinode *di = et->et_object;
121
122         le32_add_cpu(&di->i_clusters, clusters);
123         spin_lock(&OCFS2_I(inode)->ip_lock);
124         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
125         spin_unlock(&OCFS2_I(inode)->ip_lock);
126 }
127
128 static int ocfs2_dinode_sanity_check(struct inode *inode,
129                                      struct ocfs2_extent_tree *et)
130 {
131         int ret = 0;
132         struct ocfs2_dinode *di;
133
134         BUG_ON(et->et_type != OCFS2_DINODE_EXTENT);
135
136         di = et->et_object;
137         if (!OCFS2_IS_VALID_DINODE(di)) {
138                 ret = -EIO;
139                 ocfs2_error(inode->i_sb,
140                         "Inode %llu has invalid path root",
141                         (unsigned long long)OCFS2_I(inode)->ip_blkno);
142         }
143
144         return ret;
145 }
146
147 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
148         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
149         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
150         .eo_update_clusters     = ocfs2_dinode_update_clusters,
151         .eo_sanity_check        = ocfs2_dinode_sanity_check,
152         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
153 };
154
155 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
156 {
157         struct ocfs2_xattr_value_root *xv = et->et_object;
158
159         et->et_root_el = &xv->xr_list;
160 }
161
162 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
163                                               u64 blkno)
164 {
165         struct ocfs2_xattr_value_root *xv =
166                 (struct ocfs2_xattr_value_root *)et->et_object;
167
168         xv->xr_last_eb_blk = cpu_to_le64(blkno);
169 }
170
171 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
172 {
173         struct ocfs2_xattr_value_root *xv =
174                 (struct ocfs2_xattr_value_root *) et->et_object;
175
176         return le64_to_cpu(xv->xr_last_eb_blk);
177 }
178
179 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
180                                               struct ocfs2_extent_tree *et,
181                                               u32 clusters)
182 {
183         struct ocfs2_xattr_value_root *xv =
184                 (struct ocfs2_xattr_value_root *)et->et_object;
185
186         le32_add_cpu(&xv->xr_clusters, clusters);
187 }
188
189 static int ocfs2_xattr_value_sanity_check(struct inode *inode,
190                                           struct ocfs2_extent_tree *et)
191 {
192         return 0;
193 }
194
195 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
196         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
197         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
198         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
199         .eo_sanity_check        = ocfs2_xattr_value_sanity_check,
200         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
201 };
202
203 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
204 {
205         struct ocfs2_xattr_block *xb = et->et_object;
206
207         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
208 }
209
210 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
211                                                     struct ocfs2_extent_tree *et)
212 {
213         et->et_max_leaf_clusters =
214                 ocfs2_clusters_for_bytes(inode->i_sb,
215                                          OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
216 }
217
218 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
219                                              u64 blkno)
220 {
221         struct ocfs2_xattr_block *xb = et->et_object;
222         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
223
224         xt->xt_last_eb_blk = cpu_to_le64(blkno);
225 }
226
227 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
228 {
229         struct ocfs2_xattr_block *xb = et->et_object;
230         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
231
232         return le64_to_cpu(xt->xt_last_eb_blk);
233 }
234
235 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
236                                              struct ocfs2_extent_tree *et,
237                                              u32 clusters)
238 {
239         struct ocfs2_xattr_block *xb = et->et_object;
240
241         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
242 }
243
244 static int ocfs2_xattr_tree_sanity_check(struct inode *inode,
245                                          struct ocfs2_extent_tree *et)
246 {
247         return 0;
248 }
249
250 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
251         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
252         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
253         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
254         .eo_sanity_check        = ocfs2_xattr_tree_sanity_check,
255         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
256         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
257 };
258
259 static void __ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
260                                     struct inode *inode,
261                                     struct buffer_head *bh,
262                                     void *obj,
263                                     enum ocfs2_extent_tree_type et_type,
264                                     struct ocfs2_extent_tree_operations *ops)
265 {
266         et->et_type = et_type;
267         et->et_ops = ops;
268         get_bh(bh);
269         et->et_root_bh = bh;
270         if (!obj)
271                 obj = (void *)bh->b_data;
272         et->et_object = obj;
273
274         et->et_ops->eo_fill_root_el(et);
275         if (!et->et_ops->eo_fill_max_leaf_clusters)
276                 et->et_max_leaf_clusters = 0;
277         else
278                 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
279 }
280
281 static void ocfs2_get_dinode_extent_tree(struct ocfs2_extent_tree *et,
282                                          struct inode *inode,
283                                          struct buffer_head *bh)
284 {
285         __ocfs2_get_extent_tree(et, inode, bh, NULL, OCFS2_DINODE_EXTENT,
286                                 &ocfs2_dinode_et_ops);
287 }
288
289 static void ocfs2_get_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
290                                              struct inode *inode,
291                                              struct buffer_head *bh)
292 {
293         __ocfs2_get_extent_tree(et, inode, bh, NULL,
294                                 OCFS2_XATTR_TREE_EXTENT,
295                                 &ocfs2_xattr_tree_et_ops);
296 }
297
298 static void ocfs2_get_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
299                                              struct inode *inode,
300                                              struct buffer_head *bh,
301                                              struct ocfs2_xattr_value_root *xv)
302 {
303         __ocfs2_get_extent_tree(et, inode, bh, xv,
304                                 OCFS2_XATTR_VALUE_EXTENT,
305                                 &ocfs2_xattr_value_et_ops);
306 }
307
308 static void ocfs2_get_extent_tree(struct ocfs2_extent_tree *et,
309                                   struct inode *inode,
310                                   struct buffer_head *bh,
311                                   enum ocfs2_extent_tree_type et_type,
312                                   void *obj)
313 {
314         if (et_type == OCFS2_DINODE_EXTENT)
315                 ocfs2_get_dinode_extent_tree(et, inode, bh);
316         else if (et_type == OCFS2_XATTR_VALUE_EXTENT)
317                 ocfs2_get_xattr_value_extent_tree(et, inode, bh, obj);
318         else if (et_type == OCFS2_XATTR_TREE_EXTENT)
319                 ocfs2_get_xattr_tree_extent_tree(et, inode, bh);
320         else
321                 BUG();
322 }
323
324 static void ocfs2_put_extent_tree(struct ocfs2_extent_tree *et)
325 {
326         brelse(et->et_root_bh);
327 }
328
329 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
330                                             u64 new_last_eb_blk)
331 {
332         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
333 }
334
335 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
336 {
337         return et->et_ops->eo_get_last_eb_blk(et);
338 }
339
340 static inline void ocfs2_et_update_clusters(struct inode *inode,
341                                             struct ocfs2_extent_tree *et,
342                                             u32 clusters)
343 {
344         et->et_ops->eo_update_clusters(inode, et, clusters);
345 }
346
347 static inline int ocfs2_et_sanity_check(struct inode *inode,
348                                         struct ocfs2_extent_tree *et)
349 {
350         return et->et_ops->eo_sanity_check(inode, et);
351 }
352
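/*
 * Hypothetical usage sketch, not part of ocfs2: it only illustrates the
 * intended calling pattern -- take a reference with one of the specific
 * get functions, work through the ocfs2_et_*() wrappers, and drop the
 * buffer_head reference with ocfs2_put_extent_tree() when done.
 * ocfs2_num_free_extents() below follows the same pattern.
 */
static u64 example_dinode_last_eb_blk(struct inode *inode,
                                      struct buffer_head *di_bh)
{
        u64 last_eb_blk;
        struct ocfs2_extent_tree et;

        ocfs2_get_dinode_extent_tree(&et, inode, di_bh);
        last_eb_blk = ocfs2_et_get_last_eb_blk(&et);
        ocfs2_put_extent_tree(&et);

        return last_eb_blk;
}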
353 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
354 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
355                                          struct ocfs2_extent_block *eb);
356
357 /*
358  * Structures which describe a path through a btree, and functions to
359  * manipulate them.
360  *
361  * The idea here is to be as generic as possible with the tree
362  * manipulation code.
363  */
364 struct ocfs2_path_item {
365         struct buffer_head              *bh;
366         struct ocfs2_extent_list        *el;
367 };
368
369 #define OCFS2_MAX_PATH_DEPTH    5
370
371 struct ocfs2_path {
372         int                     p_tree_depth;
373         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
374 };
375
376 #define path_root_bh(_path) ((_path)->p_node[0].bh)
377 #define path_root_el(_path) ((_path)->p_node[0].el)
378 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
379 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
380 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
381
382 /*
383  * Reset the actual path elements so that we can re-use the structure
384  * to build another path. Generally, this involves freeing the buffer
385  * heads.
386  */
387 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
388 {
389         int i, start = 0, depth = 0;
390         struct ocfs2_path_item *node;
391
392         if (keep_root)
393                 start = 1;
394
395         for(i = start; i < path_num_items(path); i++) {
396                 node = &path->p_node[i];
397
398                 brelse(node->bh);
399                 node->bh = NULL;
400                 node->el = NULL;
401         }
402
403         /*
404          * Tree depth may change during truncate, or insert. If we're
405          * keeping the root extent list, then make sure that our path
406          * structure reflects the proper depth.
407          */
408         if (keep_root)
409                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
410
411         path->p_tree_depth = depth;
412 }
413
414 static void ocfs2_free_path(struct ocfs2_path *path)
415 {
416         if (path) {
417                 ocfs2_reinit_path(path, 0);
418                 kfree(path);
419         }
420 }
421
422 /*
423  * Copy all the elements of src into dest. After this call, src could
424  * be freed without affecting dest.
425  *
426  * Both paths should have the same root. Any non-root elements of dest
427  * will be freed.
428  */
429 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
430 {
431         int i;
432
433         BUG_ON(path_root_bh(dest) != path_root_bh(src));
434         BUG_ON(path_root_el(dest) != path_root_el(src));
435
436         ocfs2_reinit_path(dest, 1);
437
438         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
439                 dest->p_node[i].bh = src->p_node[i].bh;
440                 dest->p_node[i].el = src->p_node[i].el;
441
442                 if (dest->p_node[i].bh)
443                         get_bh(dest->p_node[i].bh);
444         }
445 }
446
447 /*
448  * Make the *dest path the same as src and re-initialize src path to
449  * have a root only.
450  */
451 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
452 {
453         int i;
454
455         BUG_ON(path_root_bh(dest) != path_root_bh(src));
456
457         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
458                 brelse(dest->p_node[i].bh);
459
460                 dest->p_node[i].bh = src->p_node[i].bh;
461                 dest->p_node[i].el = src->p_node[i].el;
462
463                 src->p_node[i].bh = NULL;
464                 src->p_node[i].el = NULL;
465         }
466 }
467
468 /*
469  * Insert an extent block at given index.
470  *
471  * This will not take an additional reference on eb_bh.
472  */
473 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
474                                         struct buffer_head *eb_bh)
475 {
476         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
477
478         /*
479          * Right now, no root bh is an extent block, so this helps
480          * catch code errors with dinode trees. The assertion can be
481          * safely removed if we ever need to insert extent block
482          * structures at the root.
483          */
484         BUG_ON(index == 0);
485
486         path->p_node[index].bh = eb_bh;
487         path->p_node[index].el = &eb->h_list;
488 }
489
490 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
491                                          struct ocfs2_extent_list *root_el)
492 {
493         struct ocfs2_path *path;
494
495         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
496
497         path = kzalloc(sizeof(*path), GFP_NOFS);
498         if (path) {
499                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
500                 get_bh(root_bh);
501                 path_root_bh(path) = root_bh;
502                 path_root_el(path) = root_el;
503         }
504
505         return path;
506 }
507
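/*
 * Hypothetical sketch (not part of ocfs2) of the path lifecycle: allocate
 * a path rooted at the dinode's extent list, let the lookup code below
 * (__ocfs2_find_path() and friends) populate the intermediate and leaf
 * nodes, then release all buffer_head references with ocfs2_free_path().
 */
static int example_path_lifecycle(struct inode *inode,
                                  struct buffer_head *di_bh)
{
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_path *path;

        path = ocfs2_new_path(di_bh, &di->id2.i_list);
        if (!path)
                return -ENOMEM;

        /* ... walk or journal the path here ... */

        ocfs2_free_path(path);
        return 0;
}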
508 /*
509  * Convenience function to journal all components in a path.
510  */
511 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
512                                      struct ocfs2_path *path)
513 {
514         int i, ret = 0;
515
516         if (!path)
517                 goto out;
518
519         for(i = 0; i < path_num_items(path); i++) {
520                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
521                                            OCFS2_JOURNAL_ACCESS_WRITE);
522                 if (ret < 0) {
523                         mlog_errno(ret);
524                         goto out;
525                 }
526         }
527
528 out:
529         return ret;
530 }
531
532 /*
533  * Return the index of the extent record which contains cluster #v_cluster.
534  * -1 is returned if it was not found.
535  *
536  * Should work fine on interior and exterior nodes.
537  */
538 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
539 {
540         int ret = -1;
541         int i;
542         struct ocfs2_extent_rec *rec;
543         u32 rec_end, rec_start, clusters;
544
545         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
546                 rec = &el->l_recs[i];
547
548                 rec_start = le32_to_cpu(rec->e_cpos);
549                 clusters = ocfs2_rec_clusters(el, rec);
550
551                 rec_end = rec_start + clusters;
552
553                 if (v_cluster >= rec_start && v_cluster < rec_end) {
554                         ret = i;
555                         break;
556                 }
557         }
558
559         return ret;
560 }
561
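/*
 * Hypothetical sketch (not part of ocfs2) showing the -1 convention of
 * ocfs2_search_extent_list(): a caller translates the returned index back
 * into an extent record and treats -1 as a hole.
 */
static struct ocfs2_extent_rec *
example_lookup_rec(struct ocfs2_extent_list *el, u32 v_cluster)
{
        int index = ocfs2_search_extent_list(el, v_cluster);

        if (index == -1)
                return NULL;    /* no record covers v_cluster */

        return &el->l_recs[index];
}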
562 enum ocfs2_contig_type {
563         CONTIG_NONE = 0,
564         CONTIG_LEFT,
565         CONTIG_RIGHT,
566         CONTIG_LEFTRIGHT,
567 };
568
569
570 /*
571  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
572  * ocfs2_extent_contig only work properly against leaf nodes!
573  */
574 static int ocfs2_block_extent_contig(struct super_block *sb,
575                                      struct ocfs2_extent_rec *ext,
576                                      u64 blkno)
577 {
578         u64 blk_end = le64_to_cpu(ext->e_blkno);
579
580         blk_end += ocfs2_clusters_to_blocks(sb,
581                                     le16_to_cpu(ext->e_leaf_clusters));
582
583         return blkno == blk_end;
584 }
585
586 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
587                                   struct ocfs2_extent_rec *right)
588 {
589         u32 left_range;
590
591         left_range = le32_to_cpu(left->e_cpos) +
592                 le16_to_cpu(left->e_leaf_clusters);
593
594         return (left_range == le32_to_cpu(right->e_cpos));
595 }
596
597 static enum ocfs2_contig_type
598         ocfs2_extent_contig(struct inode *inode,
599                             struct ocfs2_extent_rec *ext,
600                             struct ocfs2_extent_rec *insert_rec)
601 {
602         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
603
604         /*
605          * Refuse to coalesce extent records with different flag
606          * fields - we don't want to mix unwritten extents with user
607          * data.
608          */
609         if (ext->e_flags != insert_rec->e_flags)
610                 return CONTIG_NONE;
611
612         if (ocfs2_extents_adjacent(ext, insert_rec) &&
613             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
614                         return CONTIG_RIGHT;
615
616         blkno = le64_to_cpu(ext->e_blkno);
617         if (ocfs2_extents_adjacent(insert_rec, ext) &&
618             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
619                 return CONTIG_LEFT;
620
621         return CONTIG_NONE;
622 }
623
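/*
 * Worked example (illustrative values only, assuming 4 blocks per
 * cluster): with
 *
 *	ext        = { .e_cpos = 10, .e_leaf_clusters = 2, .e_blkno = 200 }
 *	insert_rec = { .e_cpos = 12, .e_blkno = 208 }
 *
 * and matching e_flags, ocfs2_extents_adjacent(ext, insert_rec) holds
 * (10 + 2 == 12) and ocfs2_block_extent_contig() holds (200 + 2 * 4 ==
 * 208), so ocfs2_extent_contig() returns CONTIG_RIGHT. If insert_rec
 * instead started at e_blkno = 300, the result would be CONTIG_NONE.
 */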
624 /*
625  * NOTE: We can have pretty much any combination of contiguousness and
626  * appending.
627  *
628  * The usefulness of APPEND_TAIL is more in that it lets us know that
629  * we'll have to update the path to that leaf.
630  */
631 enum ocfs2_append_type {
632         APPEND_NONE = 0,
633         APPEND_TAIL,
634 };
635
636 enum ocfs2_split_type {
637         SPLIT_NONE = 0,
638         SPLIT_LEFT,
639         SPLIT_RIGHT,
640 };
641
642 struct ocfs2_insert_type {
643         enum ocfs2_split_type   ins_split;
644         enum ocfs2_append_type  ins_appending;
645         enum ocfs2_contig_type  ins_contig;
646         int                     ins_contig_index;
647         int                     ins_tree_depth;
648 };
649
650 struct ocfs2_merge_ctxt {
651         enum ocfs2_contig_type  c_contig_type;
652         int                     c_has_empty_extent;
653         int                     c_split_covers_rec;
654 };
655
656 /*
657  * How many free extents have we got before we need more meta data?
658  */
659 int ocfs2_num_free_extents(struct ocfs2_super *osb,
660                            struct inode *inode,
661                            struct buffer_head *root_bh,
662                            enum ocfs2_extent_tree_type type,
663                            void *obj)
664 {
665         int retval;
666         struct ocfs2_extent_list *el = NULL;
667         struct ocfs2_extent_block *eb;
668         struct buffer_head *eb_bh = NULL;
669         u64 last_eb_blk = 0;
670         struct ocfs2_extent_tree et;
671
672         mlog_entry_void();
673
674         ocfs2_get_extent_tree(&et, inode, root_bh, type, obj);
675         el = et.et_root_el;
676         last_eb_blk = ocfs2_et_get_last_eb_blk(&et);
677
678         if (last_eb_blk) {
679                 retval = ocfs2_read_block(osb, last_eb_blk,
680                                           &eb_bh, OCFS2_BH_CACHED, inode);
681                 if (retval < 0) {
682                         mlog_errno(retval);
683                         goto bail;
684                 }
685                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
686                 el = &eb->h_list;
687         }
688
689         BUG_ON(el->l_tree_depth != 0);
690
691         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
692 bail:
693         if (eb_bh)
694                 brelse(eb_bh);
695
696         ocfs2_put_extent_tree(&et);
697         mlog_exit(retval);
698         return retval;
699 }
700
701 /* expects array to already be allocated
702  *
703  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
704  * l_count for you
705  */
706 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
707                                      handle_t *handle,
708                                      struct inode *inode,
709                                      int wanted,
710                                      struct ocfs2_alloc_context *meta_ac,
711                                      struct buffer_head *bhs[])
712 {
713         int count, status, i;
714         u16 suballoc_bit_start;
715         u32 num_got;
716         u64 first_blkno;
717         struct ocfs2_extent_block *eb;
718
719         mlog_entry_void();
720
721         count = 0;
722         while (count < wanted) {
723                 status = ocfs2_claim_metadata(osb,
724                                               handle,
725                                               meta_ac,
726                                               wanted - count,
727                                               &suballoc_bit_start,
728                                               &num_got,
729                                               &first_blkno);
730                 if (status < 0) {
731                         mlog_errno(status);
732                         goto bail;
733                 }
734
735                 for(i = count;  i < (num_got + count); i++) {
736                         bhs[i] = sb_getblk(osb->sb, first_blkno);
737                         if (bhs[i] == NULL) {
738                                 status = -EIO;
739                                 mlog_errno(status);
740                                 goto bail;
741                         }
742                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
743
744                         status = ocfs2_journal_access(handle, inode, bhs[i],
745                                                       OCFS2_JOURNAL_ACCESS_CREATE);
746                         if (status < 0) {
747                                 mlog_errno(status);
748                                 goto bail;
749                         }
750
751                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
752                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
753                         /* Ok, setup the minimal stuff here. */
754                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
755                         eb->h_blkno = cpu_to_le64(first_blkno);
756                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
757                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
758                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
759                         eb->h_list.l_count =
760                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
761
762                         suballoc_bit_start++;
763                         first_blkno++;
764
765                         /* We'll also be dirtied by the caller, so
766                          * this isn't absolutely necessary. */
767                         status = ocfs2_journal_dirty(handle, bhs[i]);
768                         if (status < 0) {
769                                 mlog_errno(status);
770                                 goto bail;
771                         }
772                 }
773
774                 count += num_got;
775         }
776
777         status = 0;
778 bail:
779         if (status < 0) {
780                 for(i = 0; i < wanted; i++) {
781                         if (bhs[i])
782                                 brelse(bhs[i]);
783                         bhs[i] = NULL;
784                 }
785         }
786         mlog_exit(status);
787         return status;
788 }
789
790 /*
791  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
792  *
793  * Returns the sum of the rightmost extent rec logical offset and
794  * cluster count.
795  *
796  * ocfs2_add_branch() uses this to determine what logical cluster
797  * value should be populated into the leftmost new branch records.
798  *
799  * ocfs2_shift_tree_depth() uses this to determine the # clusters
800  * value for the new topmost tree record.
801  */
802 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
803 {
804         int i;
805
806         i = le16_to_cpu(el->l_next_free_rec) - 1;
807
808         return le32_to_cpu(el->l_recs[i].e_cpos) +
809                 ocfs2_rec_clusters(el, &el->l_recs[i]);
810 }
811
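/*
 * Illustrative example: if the rightmost record is { .e_cpos = 100 } and
 * covers 8 clusters, ocfs2_sum_rightmost_rec() returns 108 -- the cpos
 * stamped into the new branch's empty records by ocfs2_add_branch(), and
 * the cluster total used by ocfs2_shift_tree_depth() for the new topmost
 * record.
 */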
812 /*
813  * Add an entire tree branch to our inode. eb_bh is the extent block
814  * to start at, if we don't want to start the branch at the dinode
815  * structure.
816  *
817  * last_eb_bh is required as we have to update its next_leaf pointer
818  * for the new last extent block.
819  *
820  * the new branch will be 'empty' in the sense that every block will
821  * contain a single record with cluster count == 0.
822  */
823 static int ocfs2_add_branch(struct ocfs2_super *osb,
824                             handle_t *handle,
825                             struct inode *inode,
826                             struct ocfs2_extent_tree *et,
827                             struct buffer_head *eb_bh,
828                             struct buffer_head **last_eb_bh,
829                             struct ocfs2_alloc_context *meta_ac)
830 {
831         int status, new_blocks, i;
832         u64 next_blkno, new_last_eb_blk;
833         struct buffer_head *bh;
834         struct buffer_head **new_eb_bhs = NULL;
835         struct ocfs2_extent_block *eb;
836         struct ocfs2_extent_list  *eb_el;
837         struct ocfs2_extent_list  *el;
838         u32 new_cpos;
839
840         mlog_entry_void();
841
842         BUG_ON(!last_eb_bh || !*last_eb_bh);
843
844         if (eb_bh) {
845                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
846                 el = &eb->h_list;
847         } else
848                 el = et->et_root_el;
849
850         /* we never add a branch to a leaf. */
851         BUG_ON(!el->l_tree_depth);
852
853         new_blocks = le16_to_cpu(el->l_tree_depth);
854
855         /* allocate the number of new eb blocks we need */
856         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
857                              GFP_KERNEL);
858         if (!new_eb_bhs) {
859                 status = -ENOMEM;
860                 mlog_errno(status);
861                 goto bail;
862         }
863
864         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
865                                            meta_ac, new_eb_bhs);
866         if (status < 0) {
867                 mlog_errno(status);
868                 goto bail;
869         }
870
871         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
872         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
873
874         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
875          * linked with the rest of the tree.
876          * conversely, new_eb_bhs[0] is the new bottommost leaf.
877          *
878          * when we leave the loop, new_last_eb_blk will point to the
879          * newest leaf, and next_blkno will point to the topmost extent
880          * block. */
881         next_blkno = new_last_eb_blk = 0;
882         for(i = 0; i < new_blocks; i++) {
883                 bh = new_eb_bhs[i];
884                 eb = (struct ocfs2_extent_block *) bh->b_data;
885                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
886                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
887                         status = -EIO;
888                         goto bail;
889                 }
890                 eb_el = &eb->h_list;
891
892                 status = ocfs2_journal_access(handle, inode, bh,
893                                               OCFS2_JOURNAL_ACCESS_CREATE);
894                 if (status < 0) {
895                         mlog_errno(status);
896                         goto bail;
897                 }
898
899                 eb->h_next_leaf_blk = 0;
900                 eb_el->l_tree_depth = cpu_to_le16(i);
901                 eb_el->l_next_free_rec = cpu_to_le16(1);
902                 /*
903                  * This actually counts as an empty extent as
904                  * its cluster count == 0
905                  */
906                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
907                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
908                 /*
909                  * eb_el isn't always an interior node, but even leaf
910                  * nodes want a zero'd flags and reserved field so
911                  * this gets the whole 32 bits regardless of use.
912                  */
913                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
914                 if (!eb_el->l_tree_depth)
915                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
916
917                 status = ocfs2_journal_dirty(handle, bh);
918                 if (status < 0) {
919                         mlog_errno(status);
920                         goto bail;
921                 }
922
923                 next_blkno = le64_to_cpu(eb->h_blkno);
924         }
925
926         /* This is a bit hairy. We want to update up to three blocks
927          * here without leaving any of them in an inconsistent state
928          * in case of error. We don't have to worry about
929          * journal_dirty erroring as it won't unless we've aborted the
930          * handle (in which case we would never be here) so reserving
931          * the write with journal_access is all we need to do. */
932         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
933                                       OCFS2_JOURNAL_ACCESS_WRITE);
934         if (status < 0) {
935                 mlog_errno(status);
936                 goto bail;
937         }
938         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
939                                       OCFS2_JOURNAL_ACCESS_WRITE);
940         if (status < 0) {
941                 mlog_errno(status);
942                 goto bail;
943         }
944         if (eb_bh) {
945                 status = ocfs2_journal_access(handle, inode, eb_bh,
946                                               OCFS2_JOURNAL_ACCESS_WRITE);
947                 if (status < 0) {
948                         mlog_errno(status);
949                         goto bail;
950                 }
951         }
952
953         /* Link the new branch into the rest of the tree (el will
954          * either be on the root_bh, or the extent block passed in). */
955         i = le16_to_cpu(el->l_next_free_rec);
956         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
957         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
958         el->l_recs[i].e_int_clusters = 0;
959         le16_add_cpu(&el->l_next_free_rec, 1);
960
961         /* fe needs a new last extent block pointer, as does the
962          * next_leaf on the previously last-extent-block. */
963         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
964
965         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
966         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
967
968         status = ocfs2_journal_dirty(handle, *last_eb_bh);
969         if (status < 0)
970                 mlog_errno(status);
971         status = ocfs2_journal_dirty(handle, et->et_root_bh);
972         if (status < 0)
973                 mlog_errno(status);
974         if (eb_bh) {
975                 status = ocfs2_journal_dirty(handle, eb_bh);
976                 if (status < 0)
977                         mlog_errno(status);
978         }
979
980         /*
981          * Some callers want to track the rightmost leaf so pass it
982          * back here.
983          */
984         brelse(*last_eb_bh);
985         get_bh(new_eb_bhs[0]);
986         *last_eb_bh = new_eb_bhs[0];
987
988         status = 0;
989 bail:
990         if (new_eb_bhs) {
991                 for (i = 0; i < new_blocks; i++)
992                         if (new_eb_bhs[i])
993                                 brelse(new_eb_bhs[i]);
994                 kfree(new_eb_bhs);
995         }
996
997         mlog_exit(status);
998         return status;
999 }
1000
1001 /*
1002  * adds another level to the allocation tree.
1003  * returns the new extent block so you can add a branch to it
1004  * after this call.
1005  */
1006 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1007                                   handle_t *handle,
1008                                   struct inode *inode,
1009                                   struct ocfs2_extent_tree *et,
1010                                   struct ocfs2_alloc_context *meta_ac,
1011                                   struct buffer_head **ret_new_eb_bh)
1012 {
1013         int status, i;
1014         u32 new_clusters;
1015         struct buffer_head *new_eb_bh = NULL;
1016         struct ocfs2_extent_block *eb;
1017         struct ocfs2_extent_list  *root_el;
1018         struct ocfs2_extent_list  *eb_el;
1019
1020         mlog_entry_void();
1021
1022         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1023                                            &new_eb_bh);
1024         if (status < 0) {
1025                 mlog_errno(status);
1026                 goto bail;
1027         }
1028
1029         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1030         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1031                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1032                 status = -EIO;
1033                 goto bail;
1034         }
1035
1036         eb_el = &eb->h_list;
1037         root_el = et->et_root_el;
1038
1039         status = ocfs2_journal_access(handle, inode, new_eb_bh,
1040                                       OCFS2_JOURNAL_ACCESS_CREATE);
1041         if (status < 0) {
1042                 mlog_errno(status);
1043                 goto bail;
1044         }
1045
1046         /* copy the root extent list data into the new extent block */
1047         eb_el->l_tree_depth = root_el->l_tree_depth;
1048         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1049         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1050                 eb_el->l_recs[i] = root_el->l_recs[i];
1051
1052         status = ocfs2_journal_dirty(handle, new_eb_bh);
1053         if (status < 0) {
1054                 mlog_errno(status);
1055                 goto bail;
1056         }
1057
1058         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1059                                       OCFS2_JOURNAL_ACCESS_WRITE);
1060         if (status < 0) {
1061                 mlog_errno(status);
1062                 goto bail;
1063         }
1064
1065         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1066
1067         /* update root_bh now */
1068         le16_add_cpu(&root_el->l_tree_depth, 1);
1069         root_el->l_recs[0].e_cpos = 0;
1070         root_el->l_recs[0].e_blkno = eb->h_blkno;
1071         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1072         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1073                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1074         root_el->l_next_free_rec = cpu_to_le16(1);
1075
1076         /* If this is our 1st tree depth shift, then last_eb_blk
1077          * becomes the allocated extent block */
1078         if (root_el->l_tree_depth == cpu_to_le16(1))
1079                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1080
1081         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1082         if (status < 0) {
1083                 mlog_errno(status);
1084                 goto bail;
1085         }
1086
1087         *ret_new_eb_bh = new_eb_bh;
1088         new_eb_bh = NULL;
1089         status = 0;
1090 bail:
1091         if (new_eb_bh)
1092                 brelse(new_eb_bh);
1093
1094         mlog_exit(status);
1095         return status;
1096 }
1097
1098 /*
1099  * Should only be called when there is no space left in any of the
1100  * leaf nodes. What we want to do is find the lowest tree depth
1101  * non-leaf extent block with room for new records. There are three
1102  * valid results of this search:
1103  *
1104  * 1) a lowest extent block is found, then we pass it back in
1105  *    *target_bh and return '0'
1106  *
1107  * 2) the search fails to find anything, but the root_el has room. We
1108  *    pass NULL back in *target_bh, but still return '0'
1109  *
1110  * 3) the search fails to find anything AND the root_el is full, in
1111  *    which case we return > 0
1112  *
1113  * return status < 0 indicates an error.
1114  */
1115 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1116                                     struct inode *inode,
1117                                     struct ocfs2_extent_tree *et,
1118                                     struct buffer_head **target_bh)
1119 {
1120         int status = 0, i;
1121         u64 blkno;
1122         struct ocfs2_extent_block *eb;
1123         struct ocfs2_extent_list  *el;
1124         struct buffer_head *bh = NULL;
1125         struct buffer_head *lowest_bh = NULL;
1126
1127         mlog_entry_void();
1128
1129         *target_bh = NULL;
1130
1131         el = et->et_root_el;
1132
1133         while(le16_to_cpu(el->l_tree_depth) > 1) {
1134                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1135                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1136                                     "extent list (next_free_rec == 0)",
1137                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1138                         status = -EIO;
1139                         goto bail;
1140                 }
1141                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1142                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1143                 if (!blkno) {
1144                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1145                                     "list where extent # %d has no physical "
1146                                     "block start",
1147                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1148                         status = -EIO;
1149                         goto bail;
1150                 }
1151
1152                 if (bh) {
1153                         brelse(bh);
1154                         bh = NULL;
1155                 }
1156
1157                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
1158                                           inode);
1159                 if (status < 0) {
1160                         mlog_errno(status);
1161                         goto bail;
1162                 }
1163
1164                 eb = (struct ocfs2_extent_block *) bh->b_data;
1165                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1166                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1167                         status = -EIO;
1168                         goto bail;
1169                 }
1170                 el = &eb->h_list;
1171
1172                 if (le16_to_cpu(el->l_next_free_rec) <
1173                     le16_to_cpu(el->l_count)) {
1174                         if (lowest_bh)
1175                                 brelse(lowest_bh);
1176                         lowest_bh = bh;
1177                         get_bh(lowest_bh);
1178                 }
1179         }
1180
1181         /* If we didn't find one and the fe doesn't have any room,
1182          * then return '1' */
1183         el = et->et_root_el;
1184         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1185                 status = 1;
1186
1187         *target_bh = lowest_bh;
1188 bail:
1189         if (bh)
1190                 brelse(bh);
1191
1192         mlog_exit(status);
1193         return status;
1194 }
1195
1196 /*
1197  * Grow a b-tree so that it has more records.
1198  *
1199  * We might shift the tree depth in which case existing paths should
1200  * be considered invalid.
1201  *
1202  * Tree depth after the grow is returned via *final_depth.
1203  *
1204  * *last_eb_bh will be updated by ocfs2_add_branch().
1205  */
1206 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1207                            struct ocfs2_extent_tree *et, int *final_depth,
1208                            struct buffer_head **last_eb_bh,
1209                            struct ocfs2_alloc_context *meta_ac)
1210 {
1211         int ret, shift;
1212         struct ocfs2_extent_list *el = et->et_root_el;
1213         int depth = le16_to_cpu(el->l_tree_depth);
1214         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1215         struct buffer_head *bh = NULL;
1216
1217         BUG_ON(meta_ac == NULL);
1218
1219         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1220         if (shift < 0) {
1221                 ret = shift;
1222                 mlog_errno(ret);
1223                 goto out;
1224         }
1225
1226         /* We traveled all the way to the bottom of the allocation tree
1227          * and didn't find room for any more extents - we need to add
1228          * another tree level */
1229         if (shift) {
1230                 BUG_ON(bh);
1231                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1232
1233                 /* ocfs2_shift_tree_depth will return us a buffer with
1234                  * the new extent block (so we can pass that to
1235                  * ocfs2_add_branch). */
1236                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1237                                              meta_ac, &bh);
1238                 if (ret < 0) {
1239                         mlog_errno(ret);
1240                         goto out;
1241                 }
1242                 depth++;
1243                 if (depth == 1) {
1244                         /*
1245                          * Special case: we have room now if we shifted from
1246                          * tree_depth 0, so no more work needs to be done.
1247                          *
1248                          * We won't be calling add_branch, so pass
1249                          * back *last_eb_bh as the new leaf. At depth
1250                          * zero, it should always be null so there's
1251                          * no reason to brelse.
1252                          */
1253                         BUG_ON(*last_eb_bh);
1254                         get_bh(bh);
1255                         *last_eb_bh = bh;
1256                         goto out;
1257                 }
1258         }
1259
1260         /* call ocfs2_add_branch to add the final part of the tree with
1261          * the new data. */
1262         mlog(0, "add branch. bh = %p\n", bh);
1263         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1264                                meta_ac);
1265         if (ret < 0) {
1266                 mlog_errno(ret);
1267                 goto out;
1268         }
1269
1270 out:
1271         if (final_depth)
1272                 *final_depth = depth;
1273         brelse(bh);
1274         return ret;
1275 }
1276
1277 /*
1278  * Shift all records one slot to the right to make room at index 0.
1279  */
1280 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1281 {
1282         int next_free = le16_to_cpu(el->l_next_free_rec);
1283         int count = le16_to_cpu(el->l_count);
1284         unsigned int num_bytes;
1285
1286         BUG_ON(!next_free);
1287         /* This will cause us to go off the end of our extent list. */
1288         BUG_ON(next_free >= count);
1289
1290         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1291
1292         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1293 }
1294
1295 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1296                               struct ocfs2_extent_rec *insert_rec)
1297 {
1298         int i, insert_index, next_free, has_empty, num_bytes;
1299         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1300         struct ocfs2_extent_rec *rec;
1301
1302         next_free = le16_to_cpu(el->l_next_free_rec);
1303         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1304
1305         BUG_ON(!next_free);
1306
1307         /* The tree code before us didn't allow enough room in the leaf. */
1308         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1309
1310         /*
1311          * The easiest way to approach this is to just remove the
1312          * empty extent and temporarily decrement next_free.
1313          */
1314         if (has_empty) {
1315                 /*
1316                  * If next_free was 1 (only an empty extent), this
1317                  * loop won't execute, which is fine. We still want
1318                  * the decrement above to happen.
1319                  * the decrement below to happen.
1320                 for(i = 0; i < (next_free - 1); i++)
1321                         el->l_recs[i] = el->l_recs[i+1];
1322
1323                 next_free--;
1324         }
1325
1326         /*
1327          * Figure out what the new record index should be.
1328          */
1329         for(i = 0; i < next_free; i++) {
1330                 rec = &el->l_recs[i];
1331
1332                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1333                         break;
1334         }
1335         insert_index = i;
1336
1337         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1338              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1339
1340         BUG_ON(insert_index < 0);
1341         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1342         BUG_ON(insert_index > next_free);
1343
1344         /*
1345          * No need to memmove if we're just adding to the tail.
1346          */
1347         if (insert_index != next_free) {
1348                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1349
1350                 num_bytes = next_free - insert_index;
1351                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1352                 memmove(&el->l_recs[insert_index + 1],
1353                         &el->l_recs[insert_index],
1354                         num_bytes);
1355         }
1356
1357         /*
1358          * Either we had an empty extent, and need to re-increment or
1359          * there was no empty extent on a non full rightmost leaf node,
1360          * in which case we still need to increment.
1361          */
1362         next_free++;
1363         el->l_next_free_rec = cpu_to_le16(next_free);
1364         /*
1365          * Make sure none of the math above just messed up our tree.
1366          */
1367         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1368
1369         el->l_recs[insert_index] = *insert_rec;
1370
1371 }
1372
1373 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1374 {
1375         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1376
1377         BUG_ON(num_recs == 0);
1378
1379         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1380                 num_recs--;
1381                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1382                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1383                 memset(&el->l_recs[num_recs], 0,
1384                        sizeof(struct ocfs2_extent_rec));
1385                 el->l_next_free_rec = cpu_to_le16(num_recs);
1386         }
1387 }
1388
1389 /*
1390  * Create an empty extent record.
1391  *
1392  * l_next_free_rec may be updated.
1393  *
1394  * If an empty extent already exists do nothing.
1395  */
1396 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1397 {
1398         int next_free = le16_to_cpu(el->l_next_free_rec);
1399
1400         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1401
1402         if (next_free == 0)
1403                 goto set_and_inc;
1404
1405         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1406                 return;
1407
1408         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1409                         "Asked to create an empty extent in a full list:\n"
1410                         "count = %u, tree depth = %u",
1411                         le16_to_cpu(el->l_count),
1412                         le16_to_cpu(el->l_tree_depth));
1413
1414         ocfs2_shift_records_right(el);
1415
1416 set_and_inc:
1417         le16_add_cpu(&el->l_next_free_rec, 1);
1418         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1419 }
1420
1421 /*
1422  * For a rotation which involves two leaf nodes, the "root node" is
1423  * the lowest level tree node which contains a path to both leaves. The
1424  * resulting set of information can be used to form a complete "subtree".
1425  *
1426  * This function is passed two full paths from the dinode down to a
1427  * pair of adjacent leaves. Its task is to figure out which path
1428  * index contains the subtree root - this can be the root index itself
1429  * in a worst-case rotation.
1430  *
1431  * The array index of the subtree root is passed back.
1432  */
1433 static int ocfs2_find_subtree_root(struct inode *inode,
1434                                    struct ocfs2_path *left,
1435                                    struct ocfs2_path *right)
1436 {
1437         int i = 0;
1438
1439         /*
1440          * Check that the caller passed in two paths from the same tree.
1441          */
1442         BUG_ON(path_root_bh(left) != path_root_bh(right));
1443
1444         do {
1445                 i++;
1446
1447                 /*
1448                  * The caller didn't pass two adjacent paths.
1449                  */
1450                 mlog_bug_on_msg(i > left->p_tree_depth,
1451                                 "Inode %lu, left depth %u, right depth %u\n"
1452                                 "left leaf blk %llu, right leaf blk %llu\n",
1453                                 inode->i_ino, left->p_tree_depth,
1454                                 right->p_tree_depth,
1455                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1456                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1457         } while (left->p_node[i].bh->b_blocknr ==
1458                  right->p_node[i].bh->b_blocknr);
1459
1460         return i - 1;
1461 }
1462
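/*
 * Illustrative example: for two depth-2 paths that share the root
 * (index 0) and the same index-1 block but diverge at their leaves
 * (index 2), the loop above stops at i == 2 and the function returns 1,
 * the index of the lowest block common to both paths -- the subtree root.
 */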
1463 typedef void (path_insert_t)(void *, struct buffer_head *);
1464
1465 /*
1466  * Traverse a btree path in search of cpos, starting at root_el.
1467  *
1468  * This code can be called with a cpos larger than the tree, in which
1469  * case it will return the rightmost path.
1470  */
1471 static int __ocfs2_find_path(struct inode *inode,
1472                              struct ocfs2_extent_list *root_el, u32 cpos,
1473                              path_insert_t *func, void *data)
1474 {
1475         int i, ret = 0;
1476         u32 range;
1477         u64 blkno;
1478         struct buffer_head *bh = NULL;
1479         struct ocfs2_extent_block *eb;
1480         struct ocfs2_extent_list *el;
1481         struct ocfs2_extent_rec *rec;
1482         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1483
1484         el = root_el;
1485         while (el->l_tree_depth) {
1486                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1487                         ocfs2_error(inode->i_sb,
1488                                     "Inode %llu has empty extent list at "
1489                                     "depth %u\n",
1490                                     (unsigned long long)oi->ip_blkno,
1491                                     le16_to_cpu(el->l_tree_depth));
1492                         ret = -EROFS;
1493                         goto out;
1494
1495                 }
1496
1497                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1498                         rec = &el->l_recs[i];
1499
1500                         /*
1501                          * In the case that cpos is off the allocation
1502                          * tree, this should just wind up returning the
1503                          * rightmost record.
1504                          */
1505                         range = le32_to_cpu(rec->e_cpos) +
1506                                 ocfs2_rec_clusters(el, rec);
1507                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1508                             break;
1509                 }
1510
1511                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1512                 if (blkno == 0) {
1513                         ocfs2_error(inode->i_sb,
1514                                     "Inode %llu has bad blkno in extent list "
1515                                     "at depth %u (index %d)\n",
1516                                     (unsigned long long)oi->ip_blkno,
1517                                     le16_to_cpu(el->l_tree_depth), i);
1518                         ret = -EROFS;
1519                         goto out;
1520                 }
1521
1522                 brelse(bh);
1523                 bh = NULL;
1524                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1525                                        &bh, OCFS2_BH_CACHED, inode);
1526                 if (ret) {
1527                         mlog_errno(ret);
1528                         goto out;
1529                 }
1530
1531                 eb = (struct ocfs2_extent_block *) bh->b_data;
1532                 el = &eb->h_list;
1533                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1534                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1535                         ret = -EIO;
1536                         goto out;
1537                 }
1538
1539                 if (le16_to_cpu(el->l_next_free_rec) >
1540                     le16_to_cpu(el->l_count)) {
1541                         ocfs2_error(inode->i_sb,
1542                                     "Inode %llu has bad count in extent list "
1543                                     "at block %llu (next free=%u, count=%u)\n",
1544                                     (unsigned long long)oi->ip_blkno,
1545                                     (unsigned long long)bh->b_blocknr,
1546                                     le16_to_cpu(el->l_next_free_rec),
1547                                     le16_to_cpu(el->l_count));
1548                         ret = -EROFS;
1549                         goto out;
1550                 }
1551
1552                 if (func)
1553                         func(data, bh);
1554         }
1555
1556 out:
1557         /*
1558          * Catch any trailing bh that the loop didn't handle.
1559          */
1560         brelse(bh);
1561
1562         return ret;
1563 }
1564
1565 /*
1566  * Given an initialized path (that is, it has a valid root extent
1567  * list), this function will traverse the btree in search of the path
1568  * which would contain cpos.
1569  *
1570  * The path traveled is recorded in the path structure.
1571  *
1572  * Note that this will not do any comparisons on leaf node extent
1573  * records, so it will work fine in the case that we just added a tree
1574  * branch.
1575  */
1576 struct find_path_data {
1577         int index;
1578         struct ocfs2_path *path;
1579 };
1580 static void find_path_ins(void *data, struct buffer_head *bh)
1581 {
1582         struct find_path_data *fp = data;
1583
1584         get_bh(bh);
1585         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1586         fp->index++;
1587 }
1588 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1589                            u32 cpos)
1590 {
1591         struct find_path_data data;
1592
1593         data.index = 1;
1594         data.path = path;
1595         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1596                                  find_path_ins, &data);
1597 }
1598
1599 static void find_leaf_ins(void *data, struct buffer_head *bh)
1600 {
1601         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1602         struct ocfs2_extent_list *el = &eb->h_list;
1603         struct buffer_head **ret = data;
1604
1605         /* We want to retain only the leaf block. */
1606         if (le16_to_cpu(el->l_tree_depth) == 0) {
1607                 get_bh(bh);
1608                 *ret = bh;
1609         }
1610 }
1611 /*
1612  * Find the leaf block in the tree which would contain cpos. No
1613  * checking of the actual leaf is done.
1614  *
1615  * Some paths want to call this instead of allocating a path structure
1616  * and calling ocfs2_find_path().
1617  *
1618  * This function doesn't handle non-btree extent lists.
1619  */
1620 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1621                     u32 cpos, struct buffer_head **leaf_bh)
1622 {
1623         int ret;
1624         struct buffer_head *bh = NULL;
1625
1626         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1627         if (ret) {
1628                 mlog_errno(ret);
1629                 goto out;
1630         }
1631
1632         *leaf_bh = bh;
1633 out:
1634         return ret;
1635 }
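
     /*
      * A hypothetical caller of ocfs2_find_leaf() (for illustration
      * only) might do:
      *
      *	struct buffer_head *leaf_bh = NULL;
      *
      *	ret = ocfs2_find_leaf(inode, root_el, cpos, &leaf_bh);
      *	if (ret == 0) {
      *		eb = (struct ocfs2_extent_block *) leaf_bh->b_data;
      *		... use eb->h_list ...
      *		brelse(leaf_bh);
      *	}
      *
      * The reference taken on *leaf_bh belongs to the caller, which
      * must brelse() it when done.
      */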
1636
1637 /*
1638  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1639  *
1640  * Basically, we've moved stuff around at the bottom of the tree and
1641  * we need to fix up the extent records above the changes to reflect
1642  * the new changes.
1643  *
1644  * left_rec: the record on the left.
1645  * left_child_el: the child list pointed to by left_rec
1646  * right_rec: the record to the right of left_rec
1647  * right_child_el: the child list pointed to by right_rec
1648  *
1649  * By definition, this only works on interior nodes.
1650  */
1651 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1652                                   struct ocfs2_extent_list *left_child_el,
1653                                   struct ocfs2_extent_rec *right_rec,
1654                                   struct ocfs2_extent_list *right_child_el)
1655 {
1656         u32 left_clusters, right_end;
1657
1658         /*
1659          * Interior nodes never have holes. Their cpos is the cpos of
1660          * the leftmost record in their child list. Their cluster
1661          * count covers the full theoretical range of their child list
1662          * - the range between their cpos and the cpos of the record
1663          * immediately to their right.
1664          */
1665         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1666         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1667                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1668                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1669         }
1670         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1671         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1672
1673         /*
1674          * Calculate the rightmost cluster count boundary before
1675          * moving cpos - we will need to adjust clusters after
1676          * updating e_cpos to keep the same highest cluster count.
1677          */
1678         right_end = le32_to_cpu(right_rec->e_cpos);
1679         right_end += le32_to_cpu(right_rec->e_int_clusters);
1680
1681         right_rec->e_cpos = left_rec->e_cpos;
1682         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1683
1684         right_end -= le32_to_cpu(right_rec->e_cpos);
1685         right_rec->e_int_clusters = cpu_to_le32(right_end);
1686 }
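
     /*
      * A worked example of the adjustment above (values are purely
      * illustrative): say left_rec starts at cpos 100, right_rec
      * starts at cpos 200 with e_int_clusters 50 (so its range ends
      * at 250), and the first non-empty record of the right child
      * list now begins at cpos 180. Then:
      *
      *	left_clusters             = 180 - 100 = 80
      *	right_end                 = 200 + 50  = 250
      *	right_rec->e_cpos         = 100 + 80  = 180
      *	right_rec->e_int_clusters = 250 - 180 = 70
      */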
1687
1688 /*
1689  * Adjust the adjacent root node records involved in a
1690  * rotation. left_el_blkno is passed in as a key so that we can easily
1691  * find its index in the root list.
1692  */
1693 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1694                                       struct ocfs2_extent_list *left_el,
1695                                       struct ocfs2_extent_list *right_el,
1696                                       u64 left_el_blkno)
1697 {
1698         int i;
1699
1700         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1701                le16_to_cpu(left_el->l_tree_depth));
1702
1703         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1704                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1705                         break;
1706         }
1707
1708         /*
1709          * The path walking code should have never returned a root and
1710          * two paths which are not adjacent.
1711          */
1712         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1713
1714         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1715                                       &root_el->l_recs[i + 1], right_el);
1716 }
1717
1718 /*
1719  * We've changed a leaf block (in right_path) and need to reflect that
1720  * change back up the subtree.
1721  *
1722  * This happens in multiple places:
1723  *   - When we've moved an extent record from the left path leaf to the right
1724  *     path leaf to make room for an empty extent in the left path leaf.
1725  *   - When our insert into the right path leaf is at the leftmost edge
1726  *     and requires an update of the path immediately to its left. This
1727  *     can occur at the end of some types of rotation and appending inserts.
1728  *   - When we've adjusted the last extent record in the left path leaf and the
1729  *     1st extent record in the right path leaf during cross extent block merge.
1730  */
1731 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1732                                        struct ocfs2_path *left_path,
1733                                        struct ocfs2_path *right_path,
1734                                        int subtree_index)
1735 {
1736         int ret, i, idx;
1737         struct ocfs2_extent_list *el, *left_el, *right_el;
1738         struct ocfs2_extent_rec *left_rec, *right_rec;
1739         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1740
1741         /*
1742          * Update the counts and position values within all the
1743          * interior nodes to reflect the leaf rotation we just did.
1744          *
1745          * The root node is handled below the loop.
1746          *
1747          * We begin the loop with right_el and left_el pointing to the
1748          * leaf lists and work our way up.
1749          *
1750          * NOTE: within this loop, left_el and right_el always refer
1751          * to the *child* lists.
1752          */
1753         left_el = path_leaf_el(left_path);
1754         right_el = path_leaf_el(right_path);
1755         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1756                 mlog(0, "Adjust records at index %u\n", i);
1757
1758                 /*
1759                  * One nice property of knowing that all of these
1760                  * nodes are below the root is that we only deal with
1761                  * the leftmost right node record and the rightmost
1762                  * left node record.
1763                  */
1764                 el = left_path->p_node[i].el;
1765                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1766                 left_rec = &el->l_recs[idx];
1767
1768                 el = right_path->p_node[i].el;
1769                 right_rec = &el->l_recs[0];
1770
1771                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1772                                               right_el);
1773
1774                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1775                 if (ret)
1776                         mlog_errno(ret);
1777
1778                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1779                 if (ret)
1780                         mlog_errno(ret);
1781
1782                 /*
1783                  * Setup our list pointers now so that the current
1784                  * parents become children in the next iteration.
1785                  */
1786                 left_el = left_path->p_node[i].el;
1787                 right_el = right_path->p_node[i].el;
1788         }
1789
1790         /*
1791          * At the root node, adjust the two adjacent records which
1792          * begin our path to the leaves.
1793          */
1794
1795         el = left_path->p_node[subtree_index].el;
1796         left_el = left_path->p_node[subtree_index + 1].el;
1797         right_el = right_path->p_node[subtree_index + 1].el;
1798
1799         ocfs2_adjust_root_records(el, left_el, right_el,
1800                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1801
1802         root_bh = left_path->p_node[subtree_index].bh;
1803
1804         ret = ocfs2_journal_dirty(handle, root_bh);
1805         if (ret)
1806                 mlog_errno(ret);
1807 }
1808
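     /*
      * Rotate the subtree rooted at subtree_index one record to the
      * right: the rightmost record of the left leaf is moved into a
      * newly created empty slot at the head of the right leaf,
      * leaving an empty extent at the head of the left leaf. The
      * interior records between the leaves and the subtree root are
      * then fixed up via ocfs2_complete_edge_insert().
      *
      * Returns 0 early if the left leaf already has an empty extent
      * at index 0.
      */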
1809 static int ocfs2_rotate_subtree_right(struct inode *inode,
1810                                       handle_t *handle,
1811                                       struct ocfs2_path *left_path,
1812                                       struct ocfs2_path *right_path,
1813                                       int subtree_index)
1814 {
1815         int ret, i;
1816         struct buffer_head *right_leaf_bh;
1817         struct buffer_head *left_leaf_bh = NULL;
1818         struct buffer_head *root_bh;
1819         struct ocfs2_extent_list *right_el, *left_el;
1820         struct ocfs2_extent_rec move_rec;
1821
1822         left_leaf_bh = path_leaf_bh(left_path);
1823         left_el = path_leaf_el(left_path);
1824
1825         if (left_el->l_next_free_rec != left_el->l_count) {
1826                 ocfs2_error(inode->i_sb,
1827                             "Inode %llu has non-full interior leaf node %llu "
1828                             "(next free = %u)",
1829                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1830                             (unsigned long long)left_leaf_bh->b_blocknr,
1831                             le16_to_cpu(left_el->l_next_free_rec));
1832                 return -EROFS;
1833         }
1834
1835         /*
1836          * This extent block may already have an empty record, so we
1837          * return early if so.
1838          */
1839         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1840                 return 0;
1841
1842         root_bh = left_path->p_node[subtree_index].bh;
1843         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1844
1845         ret = ocfs2_journal_access(handle, inode, root_bh,
1846                                    OCFS2_JOURNAL_ACCESS_WRITE);
1847         if (ret) {
1848                 mlog_errno(ret);
1849                 goto out;
1850         }
1851
1852         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1853                 ret = ocfs2_journal_access(handle, inode,
1854                                            right_path->p_node[i].bh,
1855                                            OCFS2_JOURNAL_ACCESS_WRITE);
1856                 if (ret) {
1857                         mlog_errno(ret);
1858                         goto out;
1859                 }
1860
1861                 ret = ocfs2_journal_access(handle, inode,
1862                                            left_path->p_node[i].bh,
1863                                            OCFS2_JOURNAL_ACCESS_WRITE);
1864                 if (ret) {
1865                         mlog_errno(ret);
1866                         goto out;
1867                 }
1868         }
1869
1870         right_leaf_bh = path_leaf_bh(right_path);
1871         right_el = path_leaf_el(right_path);
1872
1873         /* This is a code error, not a disk corruption. */
1874         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1875                         "because rightmost leaf block %llu is empty\n",
1876                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1877                         (unsigned long long)right_leaf_bh->b_blocknr);
1878
1879         ocfs2_create_empty_extent(right_el);
1880
1881         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1882         if (ret) {
1883                 mlog_errno(ret);
1884                 goto out;
1885         }
1886
1887         /* Do the copy now. */
1888         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1889         move_rec = left_el->l_recs[i];
1890         right_el->l_recs[0] = move_rec;
1891
1892         /*
1893          * Clear out the record we just copied and shift everything
1894          * over, leaving an empty extent in the left leaf.
1895          *
1896          * We temporarily subtract from next_free_rec so that the
1897          * shift will lose the tail record (which is now defunct).
1898          */
1899         le16_add_cpu(&left_el->l_next_free_rec, -1);
1900         ocfs2_shift_records_right(left_el);
1901         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1902         le16_add_cpu(&left_el->l_next_free_rec, 1);
1903
1904         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1905         if (ret) {
1906                 mlog_errno(ret);
1907                 goto out;
1908         }
1909
1910         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1911                                 subtree_index);
1912
1913 out:
1914         return ret;
1915 }
1916
1917 /*
1918  * Given a full path, determine what cpos value would return us a path
1919  * containing the leaf immediately to the left of the current one.
1920  *
1921  * Will return zero if the path passed in is already the leftmost path.
1922  */
1923 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1924                                          struct ocfs2_path *path, u32 *cpos)
1925 {
1926         int i, j, ret = 0;
1927         u64 blkno;
1928         struct ocfs2_extent_list *el;
1929
1930         BUG_ON(path->p_tree_depth == 0);
1931
1932         *cpos = 0;
1933
1934         blkno = path_leaf_bh(path)->b_blocknr;
1935
1936         /* Start at the tree node just above the leaf and work our way up. */
1937         i = path->p_tree_depth - 1;
1938         while (i >= 0) {
1939                 el = path->p_node[i].el;
1940
1941                 /*
1942                  * Find the extent record just before the one in our
1943                  * path.
1944                  */
1945                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1946                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1947                                 if (j == 0) {
1948                                         if (i == 0) {
1949                                                 /*
1950                                                  * We've determined that the
1951                                                  * path specified is already
1952                                                  * the leftmost one - return a
1953                                                  * cpos of zero.
1954                                                  */
1955                                                 goto out;
1956                                         }
1957                                         /*
1958                                          * The leftmost record points to our
1959                                          * leaf - we need to travel up the
1960                                          * tree one level.
1961                                          */
1962                                         goto next_node;
1963                                 }
1964
1965                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1966                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1967                                                            &el->l_recs[j - 1]);
1968                                 *cpos = *cpos - 1;
1969                                 goto out;
1970                         }
1971                 }
1972
1973                 /*
1974                  * If we got here, we never found a valid node where
1975                  * the tree indicated one should be.
1976                  */
1977                 ocfs2_error(sb,
1978                             "Invalid extent tree at extent block %llu\n",
1979                             (unsigned long long)blkno);
1980                 ret = -EROFS;
1981                 goto out;
1982
1983 next_node:
1984                 blkno = path->p_node[i].bh->b_blocknr;
1985                 i--;
1986         }
1987
1988 out:
1989         return ret;
1990 }
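
     /*
      * For example (illustrative values): if the record just to the
      * left of ours in some parent list starts at cpos 100 and covers
      * 20 clusters, the cpos computed above is 100 + 20 - 1 = 119,
      * i.e. the last cluster offset covered by the left neighbour.
      */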
1991
1992 /*
1993  * Extend the transaction by enough credits to complete the rotation,
1994  * and still leave at least the original number of credits allocated
1995  * to this transaction.
1996  */
1997 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1998                                            int op_credits,
1999                                            struct ocfs2_path *path)
2000 {
2001         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2002
2003         if (handle->h_buffer_credits < credits)
2004                 return ocfs2_extend_trans(handle, credits);
2005
2006         return 0;
2007 }
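
     /*
      * For example (illustrative numbers): with a tree depth of 3, a
      * subtree root at depth 1 and op_credits of 4, a rotation needs
      * (3 - 1) * 2 + 1 + 4 = 9 credits, and the handle is only
      * extended when fewer than that remain.
      */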
2008
2009 /*
2010  * Trap the case where we're inserting into the theoretical range past
2011  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2012  * whose cpos is less than ours into the right leaf.
2013  *
2014  * It's only necessary to look at the rightmost record of the left
2015  * leaf because the logic that calls us should ensure that the
2016  * theoretical ranges in the path components above the leaves are
2017  * correct.
2018  */
2019 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2020                                                  u32 insert_cpos)
2021 {
2022         struct ocfs2_extent_list *left_el;
2023         struct ocfs2_extent_rec *rec;
2024         int next_free;
2025
2026         left_el = path_leaf_el(left_path);
2027         next_free = le16_to_cpu(left_el->l_next_free_rec);
2028         rec = &left_el->l_recs[next_free - 1];
2029
2030         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2031                 return 1;
2032         return 0;
2033 }
2034
2035 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2036 {
2037         int next_free = le16_to_cpu(el->l_next_free_rec);
2038         unsigned int range;
2039         struct ocfs2_extent_rec *rec;
2040
2041         if (next_free == 0)
2042                 return 0;
2043
2044         rec = &el->l_recs[0];
2045         if (ocfs2_is_empty_extent(rec)) {
2046                 /* Empty list. */
2047                 if (next_free == 1)
2048                         return 0;
2049                 rec = &el->l_recs[1];
2050         }
2051
2052         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2053         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2054                 return 1;
2055         return 0;
2056 }
2057
2058 /*
2059  * Rotate all the records in a btree right one record, starting at insert_cpos.
2060  *
2061  * The path to the rightmost leaf should be passed in.
2062  *
2063  * The array is assumed to be large enough to hold an entire path (tree depth).
2064  *
2065  * Upon successful return from this function:
2066  *
2067  * - The 'right_path' array will contain a path to the leaf block
2068  *   whose range contains e_cpos.
2069  * - That leaf block will have a single empty extent in list index 0.
2070  * - In the case that the rotation requires a post-insert update,
2071  *   *ret_left_path will contain a valid path which can be passed to
2072  *   ocfs2_insert_path().
2073  */
2074 static int ocfs2_rotate_tree_right(struct inode *inode,
2075                                    handle_t *handle,
2076                                    enum ocfs2_split_type split,
2077                                    u32 insert_cpos,
2078                                    struct ocfs2_path *right_path,
2079                                    struct ocfs2_path **ret_left_path)
2080 {
2081         int ret, start, orig_credits = handle->h_buffer_credits;
2082         u32 cpos;
2083         struct ocfs2_path *left_path = NULL;
2084
2085         *ret_left_path = NULL;
2086
2087         left_path = ocfs2_new_path(path_root_bh(right_path),
2088                                    path_root_el(right_path));
2089         if (!left_path) {
2090                 ret = -ENOMEM;
2091                 mlog_errno(ret);
2092                 goto out;
2093         }
2094
2095         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2096         if (ret) {
2097                 mlog_errno(ret);
2098                 goto out;
2099         }
2100
2101         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2102
2103         /*
2104          * What we want to do here is:
2105          *
2106          * 1) Start with the rightmost path.
2107          *
2108          * 2) Determine a path to the leaf block directly to the left
2109          *    of that leaf.
2110          *
2111          * 3) Determine the 'subtree root' - the lowest level tree node
2112          *    which contains a path to both leaves.
2113          *
2114          * 4) Rotate the subtree.
2115          *
2116          * 5) Find the next subtree by considering the left path to be
2117          *    the new right path.
2118          *
2119          * The check at the top of this while loop also accepts
2120          * insert_cpos == cpos because cpos is only a _theoretical_
2121          * value to get us the left path - insert_cpos might very well
2122          * be filling that hole.
2123          *
2124          * Stop at a cpos of '0' because we either started at the
2125          * leftmost branch (i.e., a tree with one branch and a
2126          * rotation inside of it), or we've gone as far as we can in
2127          * rotating subtrees.
2128          */
2129         while (cpos && insert_cpos <= cpos) {
2130                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2131                      insert_cpos, cpos);
2132
2133                 ret = ocfs2_find_path(inode, left_path, cpos);
2134                 if (ret) {
2135                         mlog_errno(ret);
2136                         goto out;
2137                 }
2138
2139                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2140                                 path_leaf_bh(right_path),
2141                                 "Inode %lu: error during insert of %u "
2142                                 "(left path cpos %u) results in two identical "
2143                                 "paths ending at %llu\n",
2144                                 inode->i_ino, insert_cpos, cpos,
2145                                 (unsigned long long)
2146                                 path_leaf_bh(left_path)->b_blocknr);
2147
2148                 if (split == SPLIT_NONE &&
2149                     ocfs2_rotate_requires_path_adjustment(left_path,
2150                                                           insert_cpos)) {
2151
2152                         /*
2153                          * We've rotated the tree as much as we
2154                          * should. The rest is up to
2155                          * ocfs2_insert_path() to complete, after the
2156                          * record insertion. We indicate this
2157                          * situation by returning the left path.
2158                          *
2159                          * The reason we don't adjust the records here
2160                          * before the record insert is that an error
2161                          * later might break the rule where a parent
2162                          * record e_cpos will reflect the actual
2163                          * e_cpos of the 1st nonempty record of the
2164                          * child list.
2165                          */
2166                         *ret_left_path = left_path;
2167                         goto out_ret_path;
2168                 }
2169
2170                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2171
2172                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2173                      start,
2174                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2175                      right_path->p_tree_depth);
2176
2177                 ret = ocfs2_extend_rotate_transaction(handle, start,
2178                                                       orig_credits, right_path);
2179                 if (ret) {
2180                         mlog_errno(ret);
2181                         goto out;
2182                 }
2183
2184                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2185                                                  right_path, start);
2186                 if (ret) {
2187                         mlog_errno(ret);
2188                         goto out;
2189                 }
2190
2191                 if (split != SPLIT_NONE &&
2192                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2193                                                 insert_cpos)) {
2194                         /*
2195                          * A rotate moves the rightmost left leaf
2196                          * record over to the leftmost right leaf
2197                          * slot. If we're doing an extent split
2198                          * instead of a real insert, then we have to
2199                          * check that the extent to be split wasn't
2200                          * just moved over. If it was, then we can
2201                          * exit here, passing left_path back -
2202                          * ocfs2_split_extent() is smart enough to
2203                          * search both leaves.
2204                          */
2205                         *ret_left_path = left_path;
2206                         goto out_ret_path;
2207                 }
2208
2209                 /*
2210                  * There is no need to re-read the next right path
2211                  * as we know that it'll be our current left
2212                  * path. Optimize by copying values instead.
2213                  */
2214                 ocfs2_mv_path(right_path, left_path);
2215
2216                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2217                                                     &cpos);
2218                 if (ret) {
2219                         mlog_errno(ret);
2220                         goto out;
2221                 }
2222         }
2223
2224 out:
2225         ocfs2_free_path(left_path);
2226
2227 out_ret_path:
2228         return ret;
2229 }
2230
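     /*
      * 'path' must be the rightmost path in the tree. Adjust the last
      * record at every interior level so that it ends at the same
      * cluster offset as the last record in the rightmost leaf.
      */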
2231 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2232                                       struct ocfs2_path *path)
2233 {
2234         int i, idx;
2235         struct ocfs2_extent_rec *rec;
2236         struct ocfs2_extent_list *el;
2237         struct ocfs2_extent_block *eb;
2238         u32 range;
2239
2240         /* Path should always be rightmost. */
2241         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2242         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2243
2244         el = &eb->h_list;
2245         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2246         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2247         rec = &el->l_recs[idx];
2248         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2249
2250         for (i = 0; i < path->p_tree_depth; i++) {
2251                 el = path->p_node[i].el;
2252                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2253                 rec = &el->l_recs[idx];
2254
2255                 rec->e_int_clusters = cpu_to_le32(range);
2256                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2257
2258                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2259         }
2260 }
2261
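     /*
      * Unlink the extent blocks along 'path' from index 'unlink_start'
      * down to the leaf: each list is expected to be (effectively)
      * empty, is zeroed out, and its block is queued on 'dealloc' for
      * freeing and dropped from the uptodate cache. A block which
      * unexpectedly still holds more than one record is logged and
      * skipped.
      */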
2262 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2263                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2264                               struct ocfs2_path *path, int unlink_start)
2265 {
2266         int ret, i;
2267         struct ocfs2_extent_block *eb;
2268         struct ocfs2_extent_list *el;
2269         struct buffer_head *bh;
2270
2271         for(i = unlink_start; i < path_num_items(path); i++) {
2272                 bh = path->p_node[i].bh;
2273
2274                 eb = (struct ocfs2_extent_block *)bh->b_data;
2275                 /*
2276                  * Not all nodes might have had their final count
2277                  * decremented by the caller - handle this here.
2278                  */
2279                 el = &eb->h_list;
2280                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2281                         mlog(ML_ERROR,
2282                              "Inode %llu, attempted to remove extent block "
2283                              "%llu with %u records\n",
2284                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2285                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2286                              le16_to_cpu(el->l_next_free_rec));
2287
2288                         ocfs2_journal_dirty(handle, bh);
2289                         ocfs2_remove_from_cache(inode, bh);
2290                         continue;
2291                 }
2292
2293                 el->l_next_free_rec = 0;
2294                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2295
2296                 ocfs2_journal_dirty(handle, bh);
2297
2298                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2299                 if (ret)
2300                         mlog_errno(ret);
2301
2302                 ocfs2_remove_from_cache(inode, bh);
2303         }
2304 }
2305
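     /*
      * Detach the right subtree at 'subtree_index': clear the record
      * in the subtree root which points at the right child, terminate
      * the leaf chain at the left leaf (h_next_leaf_blk = 0) and hand
      * the orphaned right path over to ocfs2_unlink_path() for
      * freeing.
      */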
2306 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2307                                  struct ocfs2_path *left_path,
2308                                  struct ocfs2_path *right_path,
2309                                  int subtree_index,
2310                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2311 {
2312         int i;
2313         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2314         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2315         struct ocfs2_extent_list *el;
2316         struct ocfs2_extent_block *eb;
2317
2318         el = path_leaf_el(left_path);
2319
2320         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2321
2322         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2323                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2324                         break;
2325
2326         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2327
2328         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2329         le16_add_cpu(&root_el->l_next_free_rec, -1);
2330
2331         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2332         eb->h_next_leaf_blk = 0;
2333
2334         ocfs2_journal_dirty(handle, root_bh);
2335         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2336
2337         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2338                           subtree_index + 1);
2339 }
2340
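     /*
      * Rotate the subtree rooted at subtree_index one record to the
      * left: the leftmost record of the right leaf is rotated over
      * into the left leaf, which must have an empty extent at its
      * head (we return 0 immediately if it does not). If this leaves
      * the rightmost leaf empty, the whole right subtree is unlinked
      * and *deleted is set so the caller knows the rotation is
      * complete.
      *
      * Returns -EAGAIN if the right leaf has an empty extent but is
      * not the rightmost leaf - the caller is expected to fix that up
      * and retry.
      */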
2341 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2342                                      struct ocfs2_path *left_path,
2343                                      struct ocfs2_path *right_path,
2344                                      int subtree_index,
2345                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2346                                      int *deleted,
2347                                      struct ocfs2_extent_tree *et)
2348 {
2349         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2350         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2351         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2352         struct ocfs2_extent_block *eb;
2353
2354         *deleted = 0;
2355
2356         right_leaf_el = path_leaf_el(right_path);
2357         left_leaf_el = path_leaf_el(left_path);
2358         root_bh = left_path->p_node[subtree_index].bh;
2359         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2360
2361         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2362                 return 0;
2363
2364         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2365         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2366                 /*
2367                  * It's legal for us to proceed if the right leaf is
2368                  * the rightmost one and it has an empty extent. There
2369                  * are two cases to handle - whether the leaf will be
2370                  * empty after removal or not. If the leaf isn't empty
2371                  * then just remove the empty extent up front. The
2372                  * next block will handle empty leaves by flagging
2373                  * them for unlink.
2374                  *
2375                  * Non-rightmost leaves will throw -EAGAIN and the
2376                  * caller can manually move the subtree and retry.
2377                  */
2378
2379                 if (eb->h_next_leaf_blk != 0ULL)
2380                         return -EAGAIN;
2381
2382                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2383                         ret = ocfs2_journal_access(handle, inode,
2384                                                    path_leaf_bh(right_path),
2385                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2386                         if (ret) {
2387                                 mlog_errno(ret);
2388                                 goto out;
2389                         }
2390
2391                         ocfs2_remove_empty_extent(right_leaf_el);
2392                 } else
2393                         right_has_empty = 1;
2394         }
2395
2396         if (eb->h_next_leaf_blk == 0ULL &&
2397             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2398                 /*
2399                  * We have to update i_last_eb_blk during the meta
2400                  * data delete.
2401                  */
2402                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2403                                            OCFS2_JOURNAL_ACCESS_WRITE);
2404                 if (ret) {
2405                         mlog_errno(ret);
2406                         goto out;
2407                 }
2408
2409                 del_right_subtree = 1;
2410         }
2411
2412         /*
2413          * Getting here with an empty extent in the right path implies
2414          * that it's the rightmost path and will be deleted.
2415          */
2416         BUG_ON(right_has_empty && !del_right_subtree);
2417
2418         ret = ocfs2_journal_access(handle, inode, root_bh,
2419                                    OCFS2_JOURNAL_ACCESS_WRITE);
2420         if (ret) {
2421                 mlog_errno(ret);
2422                 goto out;
2423         }
2424
2425         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2426                 ret = ocfs2_journal_access(handle, inode,
2427                                            right_path->p_node[i].bh,
2428                                            OCFS2_JOURNAL_ACCESS_WRITE);
2429                 if (ret) {
2430                         mlog_errno(ret);
2431                         goto out;
2432                 }
2433
2434                 ret = ocfs2_journal_access(handle, inode,
2435                                            left_path->p_node[i].bh,
2436                                            OCFS2_JOURNAL_ACCESS_WRITE);
2437                 if (ret) {
2438                         mlog_errno(ret);
2439                         goto out;
2440                 }
2441         }
2442
2443         if (!right_has_empty) {
2444                 /*
2445                  * Only do this if we're moving a real
2446                  * record. Otherwise, the action is delayed until
2447                  * after removal of the right path in which case we
2448                  * can do a simple shift to remove the empty extent.
2449                  */
2450                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2451                 memset(&right_leaf_el->l_recs[0], 0,
2452                        sizeof(struct ocfs2_extent_rec));
2453         }
2454         if (eb->h_next_leaf_blk == 0ULL) {
2455                 /*
2456                  * Move recs over to get rid of empty extent, decrease
2457                  * next_free. This is allowed to remove the last
2458                  * extent in our leaf (setting l_next_free_rec to
2459                  * zero) - the delete code below won't care.
2460                  */
2461                 ocfs2_remove_empty_extent(right_leaf_el);
2462         }
2463
2464         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2465         if (ret)
2466                 mlog_errno(ret);
2467         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2468         if (ret)
2469                 mlog_errno(ret);
2470
2471         if (del_right_subtree) {
2472                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2473                                      subtree_index, dealloc);
2474                 ocfs2_update_edge_lengths(inode, handle, left_path);
2475
2476                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2477                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2478
2479                 /*
2480                  * Removal of the extent in the left leaf was skipped
2481                  * above so we could delete the right path
2482                  * 1st.
2483                  */
2484                 if (right_has_empty)
2485                         ocfs2_remove_empty_extent(left_leaf_el);
2486
2487                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2488                 if (ret)
2489                         mlog_errno(ret);
2490
2491                 *deleted = 1;
2492         } else
2493                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2494                                            subtree_index);
2495
2496 out:
2497         return ret;
2498 }
2499
2500 /*
2501  * Given a full path, determine what cpos value would return us a path
2502  * containing the leaf immediately to the right of the current one.
2503  *
2504  * Will return zero if the path passed in is already the rightmost path.
2505  *
2506  * This looks similar, but is subtly different to
2507  * ocfs2_find_cpos_for_left_leaf().
2508  */
2509 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2510                                           struct ocfs2_path *path, u32 *cpos)
2511 {
2512         int i, j, ret = 0;
2513         u64 blkno;
2514         struct ocfs2_extent_list *el;
2515
2516         *cpos = 0;
2517
2518         if (path->p_tree_depth == 0)
2519                 return 0;
2520
2521         blkno = path_leaf_bh(path)->b_blocknr;
2522
2523         /* Start at the tree node just above the leaf and work our way up. */
2524         i = path->p_tree_depth - 1;
2525         while (i >= 0) {
2526                 int next_free;
2527
2528                 el = path->p_node[i].el;
2529
2530                 /*
2531                  * Find the extent record just after the one in our
2532                  * path.
2533                  */
2534                 next_free = le16_to_cpu(el->l_next_free_rec);
2535                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2536                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2537                                 if (j == (next_free - 1)) {
2538                                         if (i == 0) {
2539                                                 /*
2540                                                  * We've determined that the
2541                                                  * path specified is already
2542                                                  * the rightmost one - return a
2543                                                  * cpos of zero.
2544                                                  */
2545                                                 goto out;
2546                                         }
2547                                         /*
2548                                          * The rightmost record points to our
2549                                          * leaf - we need to travel up the
2550                                          * tree one level.
2551                                          */
2552                                         goto next_node;
2553                                 }
2554
2555                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2556                                 goto out;
2557                         }
2558                 }
2559
2560                 /*
2561                  * If we got here, we never found a valid node where
2562                  * the tree indicated one should be.
2563                  */
2564                 ocfs2_error(sb,
2565                             "Invalid extent tree at extent block %llu\n",
2566                             (unsigned long long)blkno);
2567                 ret = -EROFS;
2568                 goto out;
2569
2570 next_node:
2571                 blkno = path->p_node[i].bh->b_blocknr;
2572                 i--;
2573         }
2574
2575 out:
2576         return ret;
2577 }
2578
2579 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2580                                             handle_t *handle,
2581                                             struct buffer_head *bh,
2582                                             struct ocfs2_extent_list *el)
2583 {
2584         int ret;
2585
2586         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2587                 return 0;
2588
2589         ret = ocfs2_journal_access(handle, inode, bh,
2590                                    OCFS2_JOURNAL_ACCESS_WRITE);
2591         if (ret) {
2592                 mlog_errno(ret);
2593                 goto out;
2594         }
2595
2596         ocfs2_remove_empty_extent(el);
2597
2598         ret = ocfs2_journal_dirty(handle, bh);
2599         if (ret)
2600                 mlog_errno(ret);
2601
2602 out:
2603         return ret;
2604 }
2605
2606 static int __ocfs2_rotate_tree_left(struct inode *inode,
2607                                     handle_t *handle, int orig_credits,
2608                                     struct ocfs2_path *path,
2609                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2610                                     struct ocfs2_path **empty_extent_path,
2611                                     struct ocfs2_extent_tree *et)
2612 {
2613         int ret, subtree_root, deleted;
2614         u32 right_cpos;
2615         struct ocfs2_path *left_path = NULL;
2616         struct ocfs2_path *right_path = NULL;
2617
2618         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2619
2620         *empty_extent_path = NULL;
2621
2622         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2623                                              &right_cpos);
2624         if (ret) {
2625                 mlog_errno(ret);
2626                 goto out;
2627         }
2628
2629         left_path = ocfs2_new_path(path_root_bh(path),
2630                                    path_root_el(path));
2631         if (!left_path) {
2632                 ret = -ENOMEM;
2633                 mlog_errno(ret);
2634                 goto out;
2635         }
2636
2637         ocfs2_cp_path(left_path, path);
2638
2639         right_path = ocfs2_new_path(path_root_bh(path),
2640                                     path_root_el(path));
2641         if (!right_path) {
2642                 ret = -ENOMEM;
2643                 mlog_errno(ret);
2644                 goto out;
2645         }
2646
2647         while (right_cpos) {
2648                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2649                 if (ret) {
2650                         mlog_errno(ret);
2651                         goto out;
2652                 }
2653
2654                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2655                                                        right_path);
2656
2657                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2658                      subtree_root,
2659                      (unsigned long long)
2660                      right_path->p_node[subtree_root].bh->b_blocknr,
2661                      right_path->p_tree_depth);
2662
2663                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2664                                                       orig_credits, left_path);
2665                 if (ret) {
2666                         mlog_errno(ret);
2667                         goto out;
2668                 }
2669
2670                 /*
2671                  * Caller might still want to make changes to the
2672                  * tree root, so re-add it to the journal here.
2673                  */
2674                 ret = ocfs2_journal_access(handle, inode,
2675                                            path_root_bh(left_path),
2676                                            OCFS2_JOURNAL_ACCESS_WRITE);
2677                 if (ret) {
2678                         mlog_errno(ret);
2679                         goto out;
2680                 }
2681
2682                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2683                                                 right_path, subtree_root,
2684                                                 dealloc, &deleted, et);
2685                 if (ret == -EAGAIN) {
2686                         /*
2687                          * The rotation has to temporarily stop due to
2688                          * the right subtree having an empty
2689                          * extent. Pass it back to the caller for a
2690                          * fixup.
2691                          */
2692                         *empty_extent_path = right_path;
2693                         right_path = NULL;
2694                         goto out;
2695                 }
2696                 if (ret) {
2697                         mlog_errno(ret);
2698                         goto out;
2699                 }
2700
2701                 /*
2702                  * The subtree rotate might have removed records on
2703                  * the rightmost edge. If so, then rotation is
2704                  * complete.
2705                  */
2706                 if (deleted)
2707                         break;
2708
2709                 ocfs2_mv_path(left_path, right_path);
2710
2711                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2712                                                      &right_cpos);
2713                 if (ret) {
2714                         mlog_errno(ret);
2715                         goto out;
2716                 }
2717         }
2718
2719 out:
2720         ocfs2_free_path(right_path);
2721         ocfs2_free_path(left_path);
2722
2723         return ret;
2724 }
2725
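     /*
      * Remove the now-empty rightmost branch described by 'path'. If
      * a branch exists to its left, 'path' is unlinked below their
      * common subtree root, the edge lengths are recalculated and
      * last_eb_blk is pointed at the new rightmost leaf. If 'path' is
      * the only branch, the root list is reset to tree depth zero and
      * last_eb_blk is cleared.
      */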
2726 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2727                                 struct ocfs2_path *path,
2728                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2729                                 struct ocfs2_extent_tree *et)
2730 {
2731         int ret, subtree_index;
2732         u32 cpos;
2733         struct ocfs2_path *left_path = NULL;
2734         struct ocfs2_extent_block *eb;
2735         struct ocfs2_extent_list *el;
2736
2737
2738         ret = ocfs2_et_sanity_check(inode, et);
2739         if (ret)
2740                 goto out;
2741         /*
2742          * There are two ways we handle this, depending on
2743          * whether path is the only existing one.
2744          */
2745         ret = ocfs2_extend_rotate_transaction(handle, 0,
2746                                               handle->h_buffer_credits,
2747                                               path);
2748         if (ret) {
2749                 mlog_errno(ret);
2750                 goto out;
2751         }
2752
2753         ret = ocfs2_journal_access_path(inode, handle, path);
2754         if (ret) {
2755                 mlog_errno(ret);
2756                 goto out;
2757         }
2758
2759         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2760         if (ret) {
2761                 mlog_errno(ret);
2762                 goto out;
2763         }
2764
2765         if (cpos) {
2766                 /*
2767                  * We have a path to the left of this one - it needs
2768                  * an update too.
2769                  */
2770                 left_path = ocfs2_new_path(path_root_bh(path),
2771                                            path_root_el(path));
2772                 if (!left_path) {
2773                         ret = -ENOMEM;
2774                         mlog_errno(ret);
2775                         goto out;
2776                 }
2777
2778                 ret = ocfs2_find_path(inode, left_path, cpos);
2779                 if (ret) {
2780                         mlog_errno(ret);
2781                         goto out;
2782                 }
2783
2784                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2785                 if (ret) {
2786                         mlog_errno(ret);
2787                         goto out;
2788                 }
2789
2790                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2791
2792                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2793                                      subtree_index, dealloc);
2794                 ocfs2_update_edge_lengths(inode, handle, left_path);
2795
2796                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2797                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2798         } else {
2799                 /*
2800                  * 'path' is also the leftmost path which
2801                  * means it must be the only one. This gets
2802                  * handled differently because we want to
2803                  * revert the inode back to having extents
2804                  * in-line.
2805                  */
2806                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2807
2808                 el = et->et_root_el;
2809                 el->l_tree_depth = 0;
2810                 el->l_next_free_rec = 0;
2811                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2812
2813                 ocfs2_et_set_last_eb_blk(et, 0);
2814         }
2815
2816         ocfs2_journal_dirty(handle, path_root_bh(path));
2817
2818 out:
2819         ocfs2_free_path(left_path);
2820         return ret;
2821 }
2822
2823 /*
2824  * Left rotation of btree records.
2825  *
2826  * In many ways, this is (unsurprisingly) the opposite of right
2827  * rotation. We start at some non-rightmost path containing an empty
2828  * extent in the leaf block. The code works its way to the rightmost
2829  * path by rotating records to the left in every subtree.
2830  *
2831  * This is used by any code which reduces the number of extent records
2832  * in a leaf. After removal, an empty record should be placed in the
2833  * leftmost list position.
2834  *
2835  * This won't handle a length update of the rightmost path records if
2836  * the rightmost tree leaf record is removed, so the caller is
2837  * responsible for detecting and correcting that.
2838  */
2839 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2840                                   struct ocfs2_path *path,
2841                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2842                                   struct ocfs2_extent_tree *et)
2843 {
2844         int ret, orig_credits = handle->h_buffer_credits;
2845         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2846         struct ocfs2_extent_block *eb;
2847         struct ocfs2_extent_list *el;
2848
2849         el = path_leaf_el(path);
2850         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2851                 return 0;
2852
2853         if (path->p_tree_depth == 0) {
2854 rightmost_no_delete:
2855                 /*
2856                  * Inline extents. This is trivially handled, so do
2857                  * it up front.
2858                  */
2859                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2860                                                        path_leaf_bh(path),
2861                                                        path_leaf_el(path));
2862                 if (ret)
2863                         mlog_errno(ret);
2864                 goto out;
2865         }
2866
2867         /*
2868          * Handle the rightmost branch now. There are several cases:
2869          *  1) simple rotation leaving records in there. That's trivial.
2870          *  2) rotation requiring a branch delete - there's no more
2871          *     records left. Two cases of this:
2872          *     a) There are branches to the left.
2873          *     b) This is also the leftmost (the only) branch.
2874          *
2875          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2876          *  2a) we need the left branch so that we can update it with the unlink
2877          *  2b) we need to bring the inode back to inline extents.
2878          */
2879
2880         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2881         el = &eb->h_list;
2882         if (eb->h_next_leaf_blk == 0) {
2883                 /*
2884                  * This gets a bit tricky if we're going to delete the
2885                  * rightmost path. Get the other cases out of the way
2886                  * 1st.
2887                  */
2888                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2889                         goto rightmost_no_delete;
2890
2891                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2892                         ret = -EIO;
2893                         ocfs2_error(inode->i_sb,
2894                                     "Inode %llu has empty extent block at %llu",
2895                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2896                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2897                         goto out;
2898                 }
2899
2900                 /*
2901                  * XXX: The caller cannot trust "path" any more after
2902                  * this as it will have been deleted. What do we do?
2903                  *
2904                  * In theory the rotate-for-merge code will never get
2905                  * here because it'll always ask for a rotate in a
2906                  * nonempty list.
2907                  */
2908
2909                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2910                                                   dealloc, et);
2911                 if (ret)
2912                         mlog_errno(ret);
2913                 goto out;
2914         }
2915
2916         /*
2917          * Now we can loop, remembering the path we get from -EAGAIN
2918          * and restarting from there.
2919          */
2920 try_rotate:
2921         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2922                                        dealloc, &restart_path, et);
2923         if (ret && ret != -EAGAIN) {
2924                 mlog_errno(ret);
2925                 goto out;
2926         }
2927
2928         while (ret == -EAGAIN) {
2929                 tmp_path = restart_path;
2930                 restart_path = NULL;
2931
2932                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2933                                                tmp_path, dealloc,
2934                                                &restart_path, et);
2935                 if (ret && ret != -EAGAIN) {
2936                         mlog_errno(ret);
2937                         goto out;
2938                 }
2939
2940                 ocfs2_free_path(tmp_path);
2941                 tmp_path = NULL;
2942
2943                 if (ret == 0)
2944                         goto try_rotate;
2945         }
2946
2947 out:
2948         ocfs2_free_path(tmp_path);
2949         ocfs2_free_path(restart_path);
2950         return ret;
2951 }
2952
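     /*
      * After a merge, the record at "index" may have been emptied. An
      * empty extent is only legal in the first array slot, so slide the
      * records before it over by one and reinitialize slot 0 when that
      * happens.
      */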
2953 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2954                                 int index)
2955 {
2956         struct ocfs2_extent_rec *rec = &el->l_recs[index];
2957         unsigned int size;
2958
2959         if (rec->e_leaf_clusters == 0) {
2960                 /*
2961                  * We consumed all of the merged-from record. An empty
2962                  * extent cannot exist anywhere but the 1st array
2963                  * position, so move things over if the merged-from
2964                  * record doesn't occupy that position.
2965                  *
2966                  * This creates a new empty extent so the caller
2967                  * should be smart enough to have removed any existing
2968                  * ones.
2969                  */
2970                 if (index > 0) {
2971                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2972                         size = index * sizeof(struct ocfs2_extent_rec);
2973                         memmove(&el->l_recs[1], &el->l_recs[0], size);
2974                 }
2975
2976                 /*
2977                  * Always memset - the caller doesn't check whether it
2978                  * created an empty extent, so there could be junk in
2979                  * the other fields.
2980                  */
2981                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2982         }
2983 }
2984
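     /*
      * Build the path to the leaf immediately to the right of left_path.
      * The leaf at left_path is expected to be full, which is why the
      * merge has to continue into the next extent block.
      */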
2985 static int ocfs2_get_right_path(struct inode *inode,
2986                                 struct ocfs2_path *left_path,
2987                                 struct ocfs2_path **ret_right_path)
2988 {
2989         int ret;
2990         u32 right_cpos;
2991         struct ocfs2_path *right_path = NULL;
2992         struct ocfs2_extent_list *left_el;
2993
2994         *ret_right_path = NULL;
2995
2996         /* This function shouldn't be called for non-trees. */
2997         BUG_ON(left_path->p_tree_depth == 0);
2998
2999         left_el = path_leaf_el(left_path);
3000         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3001
3002         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3003                                              &right_cpos);
3004         if (ret) {
3005                 mlog_errno(ret);
3006                 goto out;
3007         }
3008
3009         /* This function shouldn't be called for the rightmost leaf. */
3010         BUG_ON(right_cpos == 0);
3011
3012         right_path = ocfs2_new_path(path_root_bh(left_path),
3013                                     path_root_el(left_path));
3014         if (!right_path) {
3015                 ret = -ENOMEM;
3016                 mlog_errno(ret);
3017                 goto out;
3018         }
3019
3020         ret = ocfs2_find_path(inode, right_path, right_cpos);
3021         if (ret) {
3022                 mlog_errno(ret);
3023                 goto out;
3024         }
3025
3026         *ret_right_path = right_path;
3027 out:
3028         if (ret)
3029                 ocfs2_free_path(right_path);
3030         return ret;
3031 }
3032
3033 /*
3034  * Remove split_rec clusters from the record at index and merge them
3035  * onto the beginning of the record "next" to it.
3036  * For index < l_count - 1, "next" means the extent rec at index + 1.
3037  * For index == l_count - 1, "next" means the first extent rec of the
3038  * next extent block.
3039  */
3040 static int ocfs2_merge_rec_right(struct inode *inode,
3041                                  struct ocfs2_path *left_path,
3042                                  handle_t *handle,
3043                                  struct ocfs2_extent_rec *split_rec,
3044                                  int index)
3045 {
3046         int ret, next_free, i;
3047         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3048         struct ocfs2_extent_rec *left_rec;
3049         struct ocfs2_extent_rec *right_rec;
3050         struct ocfs2_extent_list *right_el;
3051         struct ocfs2_path *right_path = NULL;
3052         int subtree_index = 0;
3053         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3054         struct buffer_head *bh = path_leaf_bh(left_path);
3055         struct buffer_head *root_bh = NULL;
3056
3057         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3058         left_rec = &el->l_recs[index];
3059
3060         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3061             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3062                 /* This merge crosses into the next extent block. */
3063                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3064                 if (ret) {
3065                         mlog_errno(ret);
3066                         goto out;
3067                 }
3068
3069                 right_el = path_leaf_el(right_path);
3070                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3071                 BUG_ON(next_free <= 0);
3072                 right_rec = &right_el->l_recs[0];
3073                 if (ocfs2_is_empty_extent(right_rec)) {
3074                         BUG_ON(next_free <= 1);
3075                         right_rec = &right_el->l_recs[1];
3076                 }
3077
3078                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3079                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3080                        le32_to_cpu(right_rec->e_cpos));
3081
3082                 subtree_index = ocfs2_find_subtree_root(inode,
3083                                                         left_path, right_path);
3084
3085                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3086                                                       handle->h_buffer_credits,
3087                                                       right_path);
3088                 if (ret) {
3089                         mlog_errno(ret);
3090                         goto out;
3091                 }
3092
3093                 root_bh = left_path->p_node[subtree_index].bh;
3094                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3095
3096                 ret = ocfs2_journal_access(handle, inode, root_bh,
3097                                            OCFS2_JOURNAL_ACCESS_WRITE);
3098                 if (ret) {
3099                         mlog_errno(ret);
3100                         goto out;
3101                 }
3102
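                     /*
                      * The edge update below may touch any block beneath the
                      * subtree root, so get journal access to every node on
                      * both paths under it.
                      */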
3103                 for (i = subtree_index + 1;
3104                      i < path_num_items(right_path); i++) {
3105                         ret = ocfs2_journal_access(handle, inode,
3106                                                    right_path->p_node[i].bh,
3107                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3108                         if (ret) {
3109                                 mlog_errno(ret);
3110                                 goto out;
3111                         }
3112
3113                         ret = ocfs2_journal_access(handle, inode,
3114                                                    left_path->p_node[i].bh,
3115                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3116                         if (ret) {
3117                                 mlog_errno(ret);
3118                                 goto out;
3119                         }
3120                 }
3121
3122         } else {
3123                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3124                 right_rec = &el->l_recs[index + 1];
3125         }
3126
3127         ret = ocfs2_journal_access(handle, inode, bh,
3128                                    OCFS2_JOURNAL_ACCESS_WRITE);
3129         if (ret) {
3130                 mlog_errno(ret);
3131                 goto out;
3132         }
3133
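             /*
              * Transfer split_clusters from the tail of left_rec onto the
              * front of right_rec: shrink left_rec, then move right_rec's
              * start (cpos and blkno) back and grow its length to match.
              * For example, left (cpos 10, 4 clusters) and right (cpos 14,
              * 6 clusters) with split_clusters == 2 become left (10, 2)
              * and right (12, 8).
              */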
3134         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3135
3136         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3137         le64_add_cpu(&right_rec->e_blkno,
3138                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3139         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3140
3141         ocfs2_cleanup_merge(el, index);
3142
3143         ret = ocfs2_journal_dirty(handle, bh);
3144         if (ret)
3145                 mlog_errno(ret);
3146
3147         if (right_path) {
3148                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3149                 if (ret)
3150                         mlog_errno(ret);
3151
3152                 ocfs2_complete_edge_insert(inode, handle, left_path,
3153                                            right_path, subtree_index);
3154         }
3155 out:
3156         if (right_path)
3157                 ocfs2_free_path(right_path);
3158         return ret;
3159 }
3160
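     /*
      * Build the path to the leaf immediately to the left of right_path,
      * for merges that cross back into the previous extent block.
      */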
3161 static int ocfs2_get_left_path(struct inode *inode,
3162                                struct ocfs2_path *right_path,
3163                                struct ocfs2_path **ret_left_path)
3164 {
3165         int ret;
3166         u32 left_cpos;
3167         struct ocfs2_path *left_path = NULL;
3168
3169         *ret_left_path = NULL;
3170
3171         /* This function shouldn't be called for non-trees. */
3172         BUG_ON(right_path->p_tree_depth == 0);
3173
3174         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3175                                             right_path, &left_cpos);
3176         if (ret) {
3177                 mlog_errno(ret);
3178                 goto out;
3179         }
3180
3181         /* This function shouldn't be called for the leftmost leaf. */
3182         BUG_ON(left_cpos == 0);
3183
3184         left_path = ocfs2_new_path(path_root_bh(right_path),
3185                                    path_root_el(right_path));
3186         if (!left_path) {
3187                 ret = -ENOMEM;
3188                 mlog_errno(ret);
3189                 goto out;
3190         }
3191
3192         ret = ocfs2_find_path(inode, left_path, left_cpos);
3193         if (ret) {
3194                 mlog_errno(ret);
3195                 goto out;
3196         }
3197
3198         *ret_left_path = left_path;
3199 out:
3200         if (ret)
3201                 ocfs2_free_path(left_path);
3202         return ret;
3203 }
3204
3205 /*
3206  * Remove split_rec clusters from the record at index and merge them
3207  * onto the tail of the record "before" it.
3208  * For index > 0, the "before" means the extent rec at index - 1.
3209  *
3210  * For index == 0, the "before" means the last record of the previous
3211  * extent block. There is also a situation where we may need to
3212  * remove the rightmost leaf extent block from right_path and change
3213  * right_path to point at the new rightmost leaf.
3214  */
3215 static int ocfs2_merge_rec_left(struct inode *inode,
3216                                 struct ocfs2_path *right_path,
3217                                 handle_t *handle,
3218                                 struct ocfs2_extent_rec *split_rec,
3219                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3220                                 struct ocfs2_extent_tree *et,
3221                                 int index)
3222 {
3223         int ret, i, subtree_index = 0, has_empty_extent = 0;
3224         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3225         struct ocfs2_extent_rec *left_rec;
3226         struct ocfs2_extent_rec *right_rec;
3227         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3228         struct buffer_head *bh = path_leaf_bh(right_path);
3229         struct buffer_head *root_bh = NULL;
3230         struct ocfs2_path *left_path = NULL;
3231         struct ocfs2_extent_list *left_el;
3232
3233         BUG_ON(index < 0);
3234
3235         right_rec = &el->l_recs[index];
3236         if (index == 0) {
3237                 /* This merge crosses into the previous extent block. */
3238                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3239                 if (ret) {
3240                         mlog_errno(ret);
3241                         goto out;
3242                 }
3243
3244                 left_el = path_leaf_el(left_path);
3245                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3246                        le16_to_cpu(left_el->l_count));
3247
3248                 left_rec = &left_el->l_recs[
3249                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3250                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3251                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3252                        le32_to_cpu(split_rec->e_cpos));
3253
3254                 subtree_index = ocfs2_find_subtree_root(inode,
3255                                                         left_path, right_path);
3256
3257                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3258                                                       handle->h_buffer_credits,
3259                                                       left_path);
3260                 if (ret) {
3261                         mlog_errno(ret);
3262                         goto out;
3263                 }
3264
3265                 root_bh = left_path->p_node[subtree_index].bh;
3266                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3267
3268                 ret = ocfs2_journal_access(handle, inode, root_bh,
3269                                            OCFS2_JOURNAL_ACCESS_WRITE);
3270                 if (ret) {
3271                         mlog_errno(ret);
3272                         goto out;
3273                 }
3274
3275                 for (i = subtree_index + 1;
3276                      i < path_num_items(right_path); i++) {
3277                         ret = ocfs2_journal_access(handle, inode,
3278                                                    right_path->p_node[i].bh,
3279                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3280                         if (ret) {
3281                                 mlog_errno(ret);
3282                                 goto out;
3283                         }
3284
3285                         ret = ocfs2_journal_access(handle, inode,
3286                                                    left_path->p_node[i].bh,
3287                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3288                         if (ret) {
3289                                 mlog_errno(ret);
3290                                 goto out;
3291                         }
3292                 }
3293         } else {
3294                 left_rec = &el->l_recs[index - 1];
3295                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3296                         has_empty_extent = 1;
3297         }
3298
3299         ret = ocfs2_journal_access(handle, inode, bh,
3300                                    OCFS2_JOURNAL_ACCESS_WRITE);
3301         if (ret) {
3302                 mlog_errno(ret);
3303                 goto out;
3304         }
3305
3306         if (has_empty_extent && index == 1) {
3307                 /*
3308                  * The easy case - we can just plop the record right in.
3309                  */
3310                 *left_rec = *split_rec;
3311
3312                 has_empty_extent = 0;
3313         } else
3314                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3315
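             /*
              * The split region now belongs to left_rec, so trim it off the
              * front of right_rec: advance its cpos and blkno and shrink its
              * length by split_clusters.
              */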
3316         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3317         le64_add_cpu(&right_rec->e_blkno,
3318                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3319         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3320
3321         ocfs2_cleanup_merge(el, index);
3322
3323         ret = ocfs2_journal_dirty(handle, bh);
3324         if (ret)
3325                 mlog_errno(ret);
3326
3327         if (left_path) {
3328                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3329                 if (ret)
3330                         mlog_errno(ret);
3331
3332                 /*
3333                  * If right_rec is now empty and its extent block is
3334                  * also empty, ocfs2_complete_edge_insert() can't handle
3335                  * it and we need to delete the right extent block.
3336                  */
3337                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3338                     le16_to_cpu(el->l_next_free_rec) == 1) {
3339
3340                         ret = ocfs2_remove_rightmost_path(inode, handle,
3341                                                           right_path,
3342                                                           dealloc, et);
3343                         if (ret) {
3344                                 mlog_errno(ret);
3345                                 goto out;
3346                         }
3347
3348                         /* Now the rightmost extent block has been deleted.
3349                          * So we use the new rightmost path.
3350                          */
3351                         ocfs2_mv_path(right_path, left_path);
3352                         left_path = NULL;
3353                 } else
3354                         ocfs2_complete_edge_insert(inode, handle, left_path,
3355                                                    right_path, subtree_index);
3356         }
3357 out:
3358         if (left_path)
3359                 ocfs2_free_path(left_path);
3360         return ret;
3361 }
3362
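     /*
      * Try to absorb split_rec into the records neighbouring
      * el->l_recs[split_index], based on the contiguity type the caller
      * computed. Merging can leave empty extents behind, so the tree is
      * rotated left as needed to dispose of them.
      */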
3363 static int ocfs2_try_to_merge_extent(struct inode *inode,
3364                                      handle_t *handle,
3365                                      struct ocfs2_path *path,
3366                                      int split_index,
3367                                      struct ocfs2_extent_rec *split_rec,
3368                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3369                                      struct ocfs2_merge_ctxt *ctxt,
3370                                      struct ocfs2_extent_tree *et)
3371
3372 {
3373         int ret = 0;
3374         struct ocfs2_extent_list *el = path_leaf_el(path);
3375         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3376
3377         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3378
3379         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3380                 /*
3381                  * The merge code will need to create an empty
3382                  * extent to take the place of the newly
3383                  * emptied slot. Remove any pre-existing empty
3384                  * extents - having more than one in a leaf is
3385                  * illegal.
3386                  */
3387                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3388                                              dealloc, et);
3389                 if (ret) {
3390                         mlog_errno(ret);
3391                         goto out;
3392                 }
3393                 split_index--;
3394                 rec = &el->l_recs[split_index];
3395         }
3396
3397         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3398                 /*
3399                  * Left-right contig implies this.
3400                  */
3401                 BUG_ON(!ctxt->c_split_covers_rec);
3402
3403                 /*
3404                  * Since the leftright insert always covers the entire
3405                  * extent, this call will delete the insert record
3406                  * entirely, resulting in an empty extent record added to
3407                  * the extent block.
3408                  *
3409                  * Since the adding of an empty extent shifts
3410                  * everything back to the right, there's no need to
3411                  * update split_index here.
3412                  *
3413                  * When split_index is zero, we need to merge into the
3414                  * previous extent block. It is more efficient and easier
3415                  * if we do merge_right first and merge_left later.
3416                  */
3417                 ret = ocfs2_merge_rec_right(inode, path,
3418                                             handle, split_rec,
3419                                             split_index);
3420                 if (ret) {
3421                         mlog_errno(ret);
3422                         goto out;
3423                 }
3424
3425                 /*
3426                  * We can only get this from a logic error above.
3427                  */
3428                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3429
3430                 /* The merge left us with an empty extent, remove it. */
3431                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3432                                              dealloc, et);
3433                 if (ret) {
3434                         mlog_errno(ret);
3435                         goto out;
3436                 }
3437
3438                 rec = &el->l_recs[split_index];
3439
3440                 /*
3441                  * Note that we don't pass split_rec here on purpose -
3442                  * we've merged it into the rec already.
3443                  */
3444                 ret = ocfs2_merge_rec_left(inode, path,
3445                                            handle, rec,
3446                                            dealloc, et,
3447                                            split_index);
3448
3449                 if (ret) {
3450                         mlog_errno(ret);
3451                         goto out;
3452                 }
3453
3454                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3455                                              dealloc, et);
3456                 /*
3457                  * Error from this last rotate is not critical, so
3458                  * print but don't bubble it up.
3459                  */
3460                 if (ret)
3461                         mlog_errno(ret);
3462                 ret = 0;
3463         } else {
3464                 /*
3465                  * Merge a record to the left or right.
3466                  *
3467                  * 'contig_type' is relative to the existing record,
3468                  * so for example, if we're "right contig", it's to
3469                  * the record on the left (hence the left merge).
3470                  */
3471                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3472                         ret = ocfs2_merge_rec_left(inode,
3473                                                    path,
3474                                                    handle, split_rec,
3475                                                    dealloc, et,
3476                                                    split_index);
3477                         if (ret) {
3478                                 mlog_errno(ret);
3479                                 goto out;
3480                         }
3481                 } else {
3482                         ret = ocfs2_merge_rec_right(inode,
3483                                                     path,
3484                                                     handle, split_rec,
3485                                                     split_index);
3486                         if (ret) {
3487                                 mlog_errno(ret);
3488                                 goto out;
3489                         }
3490                 }
3491
3492                 if (ctxt->c_split_covers_rec) {
3493                         /*
3494                          * The merge may have left an empty extent in
3495                          * our leaf. Try to rotate it away.
3496                          */
3497                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3498                                                      dealloc, et);
3499                         if (ret)
3500                                 mlog_errno(ret);
3501                         ret = 0;
3502                 }
3503         }
3504
3505 out:
3506         return ret;
3507 }
3508
3509 static void ocfs2_subtract_from_rec(struct super_block *sb,
3510                                     enum ocfs2_split_type split,
3511                                     struct ocfs2_extent_rec *rec,
3512                                     struct ocfs2_extent_rec *split_rec)
3513 {
3514         u64 len_blocks;
3515
3516         len_blocks = ocfs2_clusters_to_blocks(sb,
3517                                 le16_to_cpu(split_rec->e_leaf_clusters));
3518
3519         if (split == SPLIT_LEFT) {
3520                 /*
3521                  * Region is on the left edge of the existing
3522                  * record.
3523                  */
3524                 le32_add_cpu(&rec->e_cpos,
3525                              le16_to_cpu(split_rec->e_leaf_clusters));
3526                 le64_add_cpu(&rec->e_blkno, len_blocks);
3527                 le16_add_cpu(&rec->e_leaf_clusters,
3528                              -le16_to_cpu(split_rec->e_leaf_clusters));
3529         } else {
3530                 /*
3531                  * Region is on the right edge of the existing
3532                  * record.
3533                  */
3534                 le16_add_cpu(&rec->e_leaf_clusters,
3535                              -le16_to_cpu(split_rec->e_leaf_clusters));
3536         }
3537 }
3538
3539 /*
3540  * Do the final bits of extent record insertion at the target leaf
3541  * list. If this leaf is part of an allocation tree, it is assumed
3542  * that the tree above has been prepared.
3543  */
3544 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3545                                  struct ocfs2_extent_list *el,
3546                                  struct ocfs2_insert_type *insert,
3547                                  struct inode *inode)
3548 {
3549         int i = insert->ins_contig_index;
3550         unsigned int range;
3551         struct ocfs2_extent_rec *rec;
3552
3553         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3554
3555         if (insert->ins_split != SPLIT_NONE) {
3556                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3557                 BUG_ON(i == -1);
3558                 rec = &el->l_recs[i];
3559                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3560                                         insert_rec);
3561                 goto rotate;
3562         }
3563
3564         /*
3565          * Contiguous insert - either left or right.
3566          */
3567         if (insert->ins_contig != CONTIG_NONE) {
3568                 rec = &el->l_recs[i];
3569                 if (insert->ins_contig == CONTIG_LEFT) {
3570                         rec->e_blkno = insert_rec->e_blkno;
3571                         rec->e_cpos = insert_rec->e_cpos;
3572                 }
3573                 le16_add_cpu(&rec->e_leaf_clusters,
3574                              le16_to_cpu(insert_rec->e_leaf_clusters));
3575                 return;
3576         }
3577
3578         /*
3579          * Handle insert into an empty leaf.
3580          */
3581         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3582             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3583              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3584                 el->l_recs[0] = *insert_rec;
3585                 el->l_next_free_rec = cpu_to_le16(1);
3586                 return;
3587         }
3588
3589         /*
3590          * Appending insert.
3591          */
3592         if (insert->ins_appending == APPEND_TAIL) {
3593                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3594                 rec = &el->l_recs[i];
3595                 range = le32_to_cpu(rec->e_cpos)
3596                         + le16_to_cpu(rec->e_leaf_clusters);
3597                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3598
3599                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3600                                 le16_to_cpu(el->l_count),
3601                                 "inode %lu, depth %u, count %u, next free %u, "
3602                                 "rec.cpos %u, rec.clusters %u, "
3603                                 "insert.cpos %u, insert.clusters %u\n",
3604                                 inode->i_ino,
3605                                 le16_to_cpu(el->l_tree_depth),
3606                                 le16_to_cpu(el->l_count),
3607                                 le16_to_cpu(el->l_next_free_rec),
3608                                 le32_to_cpu(el->l_recs[i].e_cpos),
3609                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3610                                 le32_to_cpu(insert_rec->e_cpos),
3611                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3612                 i++;
3613                 el->l_recs[i] = *insert_rec;
3614                 le16_add_cpu(&el->l_next_free_rec, 1);
3615                 return;
3616         }
3617
3618 rotate:
3619         /*
3620          * Ok, we have to rotate.
3621          *
3622          * At this point, it is safe to assume that inserting into an
3623          * empty leaf and appending to a leaf have both been handled
3624          * above.
3625          *
3626          * This leaf needs to have space, either by the empty 1st
3627          * This leaf needs to have space, either via the empty first
3628          * extent record, or by virtue of l_next_free_rec < l_count.
3629         ocfs2_rotate_leaf(el, insert_rec);
3630 }
3631
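     /*
      * Stretch the last record at each interior level of 'path' so that
      * the tree covers the extent being appended. The leaf itself is
      * left for the caller to update.
      */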
3632 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3633                                            handle_t *handle,
3634                                            struct ocfs2_path *path,
3635                                            struct ocfs2_extent_rec *insert_rec)
3636 {
3637         int ret, i, next_free;
3638         struct buffer_head *bh;
3639         struct ocfs2_extent_list *el;
3640         struct ocfs2_extent_rec *rec;
3641
3642         /*
3643          * Update everything except the leaf block.
3644          */
3645         for (i = 0; i < path->p_tree_depth; i++) {
3646                 bh = path->p_node[i].bh;
3647                 el = path->p_node[i].el;
3648
3649                 next_free = le16_to_cpu(el->l_next_free_rec);
3650                 if (next_free == 0) {
3651                         ocfs2_error(inode->i_sb,
3652                                     "Dinode %llu has a bad extent list",
3653                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3654                         ret = -EIO;
3655                         return;
3656                 }
3657
3658                 rec = &el->l_recs[next_free - 1];
3659
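                     /*
                      * New interior length = end of the inserted extent minus
                      * this record's cpos, computed below in little-endian
                      * form. E.g. an insert at cpos 100 for 8 clusters against
                      * a record starting at cpos 40 yields 68 clusters.
                      */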
3660                 rec->e_int_clusters = insert_rec->e_cpos;
3661                 le32_add_cpu(&rec->e_int_clusters,
3662                              le16_to_cpu(insert_rec->e_leaf_clusters));
3663                 le32_add_cpu(&rec->e_int_clusters,
3664                              -le32_to_cpu(rec->e_cpos));
3665
3666                 ret = ocfs2_journal_dirty(handle, bh);
3667                 if (ret)
3668                         mlog_errno(ret);
3669
3670         }
3671 }
3672
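     /*
      * Prepare an append into the rightmost leaf: journal right_path and
      * stretch its interior records to cover insert_rec. If the append
      * lands in the leftmost slot of a leaf that isn't the leftmost one,
      * the path to the left is looked up and handed back in
      * *ret_left_path so the caller can fix up that edge as well.
      */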
3673 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3674                                     struct ocfs2_extent_rec *insert_rec,
3675                                     struct ocfs2_path *right_path,
3676                                     struct ocfs2_path **ret_left_path)
3677 {
3678         int ret, next_free;
3679         struct ocfs2_extent_list *el;
3680         struct ocfs2_path *left_path = NULL;
3681
3682         *ret_left_path = NULL;
3683
3684         /*
3685          * This function shouldn't be called for non-trees. The extent
3686          * record cluster count manipulation below only works for interior nodes.
3687          */
3688         BUG_ON(right_path->p_tree_depth == 0);
3689
3690         /*
3691          * If our appending insert is at the leftmost edge of a leaf,
3692          * then we might need to update the rightmost records of the
3693          * neighboring path.
3694          */
3695         el = path_leaf_el(right_path);
3696         next_free = le16_to_cpu(el->l_next_free_rec);
3697         if (next_free == 0 ||
3698             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3699                 u32 left_cpos;
3700
3701                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3702                                                     &left_cpos);
3703                 if (ret) {
3704                         mlog_errno(ret);
3705                         goto out;
3706                 }
3707
3708                 mlog(0, "Append may need a left path update. cpos: %u, "
3709                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3710                      left_cpos);
3711
3712                 /*
3713                  * No need to worry if the append is already in the
3714                  * leftmost leaf.
3715                  */
3716                 if (left_cpos) {
3717                         left_path = ocfs2_new_path(path_root_bh(right_path),
3718                                                    path_root_el(right_path));
3719                         if (!left_path) {
3720                                 ret = -ENOMEM;
3721                                 mlog_errno(ret);
3722                                 goto out;
3723                         }
3724
3725                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3726                         if (ret) {
3727                                 mlog_errno(ret);
3728                                 goto out;
3729                         }
3730
3731                         /*
3732                          * ocfs2_insert_path() will pass the left_path to the
3733                          * journal for us.
3734                          */
3735                 }
3736         }
3737
3738         ret = ocfs2_journal_access_path(inode, handle, right_path);
3739         if (ret) {
3740                 mlog_errno(ret);
3741                 goto out;
3742         }
3743
3744         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3745
3746         *ret_left_path = left_path;
3747         ret = 0;
3748 out:
3749         if (ret != 0)
3750                 ocfs2_free_path(left_path);
3751
3752         return ret;
3753 }
3754
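     /*
      * Carry out a left or right split: find the record containing
      * split_rec->e_cpos, subtract the split region from it and rotate
      * split_rec into whichever leaf has the empty slot for it.
      * left_path, when given, is the leaf the rotate code handed back
      * for a post-insert update.
      */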
3755 static void ocfs2_split_record(struct inode *inode,
3756                                struct ocfs2_path *left_path,
3757                                struct ocfs2_path *right_path,
3758                                struct ocfs2_extent_rec *split_rec,
3759                                enum ocfs2_split_type split)
3760 {
3761         int index;
3762         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3763         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3764         struct ocfs2_extent_rec *rec, *tmprec;
3765
3766         right_el = path_leaf_el(right_path);
3767         if (left_path)
3768                 left_el = path_leaf_el(left_path);
3769
3770         el = right_el;
3771         insert_el = right_el;
3772         index = ocfs2_search_extent_list(el, cpos);
3773         if (index != -1) {
3774                 if (index == 0 && left_path) {
3775                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3776
3777                         /*
3778                          * This typically means that the record
3779                          * started in the left path but moved to the
3780                          * right as a result of rotation. We either
3781                          * move the existing record to the left, or we
3782                          * do the later insert there.
3783                          *
3784                          * In this case, the left path should always
3785                          * exist as the rotate code will have passed
3786                          * it back for a post-insert update.
3787                          */
3788
3789                         if (split == SPLIT_LEFT) {
3790                                 /*
3791                                  * It's a left split. Since we know
3792                                  * that the rotate code gave us an
3793                                  * empty extent in the left path, we
3794                                  * can just do the insert there.
3795                                  */
3796                                 insert_el = left_el;
3797                         } else {
3798                                 /*
3799                                  * Right split - we have to move the
3800                                  * existing record over to the left
3801                                  * leaf. The insert will be into the
3802                                  * newly created empty extent in the
3803                                  * right leaf.
3804                                  */
3805                                 tmprec = &right_el->l_recs[index];
3806                                 ocfs2_rotate_leaf(left_el, tmprec);
3807                                 el = left_el;
3808
3809                                 memset(tmprec, 0, sizeof(*tmprec));
3810                                 index = ocfs2_search_extent_list(left_el, cpos);
3811                                 BUG_ON(index == -1);
3812                         }
3813                 }
3814         } else {
3815                 BUG_ON(!left_path);
3816                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3817                 /*
3818                  * Left path is easy - we can just allow the insert to
3819                  * happen.
3820                  */
3821                 el = left_el;
3822                 insert_el = left_el;
3823                 index = ocfs2_search_extent_list(el, cpos);
3824                 BUG_ON(index == -1);
3825         }
3826
3827         rec = &el->l_recs[index];
3828         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3829         ocfs2_rotate_leaf(insert_el, split_rec);
3830 }
3831
3832 /*
3833  * This function only does inserts on an allocation b-tree. For tree
3834  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3835  *
3836  * right_path is the path we want to do the actual insert
3837  * in. left_path should only be passed in if we need to update that
3838  * portion of the tree after an edge insert.
3839  */
3840 static int ocfs2_insert_path(struct inode *inode,
3841                              handle_t *handle,
3842                              struct ocfs2_path *left_path,
3843                              struct ocfs2_path *right_path,
3844                              struct ocfs2_extent_rec *insert_rec,
3845                              struct ocfs2_insert_type *insert)
3846 {
3847         int ret, subtree_index;
3848         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3849
3850         if (left_path) {
3851                 int credits = handle->h_buffer_credits;
3852
3853                 /*
3854                  * There's a chance that left_path got passed back to
3855                  * us without being accounted for in the
3856                  * journal. Extend our transaction here to be sure we
3857                  * can change those blocks.
3858                  */
3859                 credits += left_path->p_tree_depth;
3860
3861                 ret = ocfs2_extend_trans(handle, credits);
3862                 if (ret < 0) {
3863                         mlog_errno(ret);
3864                         goto out;
3865                 }
3866
3867                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3868                 if (ret < 0) {
3869                         mlog_errno(ret);
3870                         goto out;
3871                 }
3872         }
3873
3874         /*
3875          * Pass both paths to the journal. The majority of inserts
3876          * will be touching all components anyway.
3877          */
3878         ret = ocfs2_journal_access_path(inode, handle, right_path);
3879         if (ret < 0) {
3880                 mlog_errno(ret);
3881                 goto out;
3882         }
3883
3884         if (insert->ins_split != SPLIT_NONE) {
3885                 /*
3886                  * We could call ocfs2_insert_at_leaf() for some types
3887                  * of splits, but it's easier to just let one separate
3888                  * function sort it all out.
3889                  */
3890                 ocfs2_split_record(inode, left_path, right_path,
3891                                    insert_rec, insert->ins_split);
3892
3893                 /*
3894                  * Split might have modified either leaf and we don't
3895                  * have a guarantee that the later edge insert will
3896                  * dirty this for us.
3897                  */
3898                 if (left_path) {
3899                         ret = ocfs2_journal_dirty(handle,
3900                                                   path_leaf_bh(left_path));
3901                         if (ret)
3902                                 mlog_errno(ret);
                     }
3903         } else
3904                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3905                                      insert, inode);
3906
3907         ret = ocfs2_journal_dirty(handle, leaf_bh);
3908         if (ret)
3909                 mlog_errno(ret);
3910
3911         if (left_path) {
3912                 /*
3913                  * The rotate code has indicated that we need to fix
3914                  * up portions of the tree after the insert.
3915                  *
3916                  * XXX: Should we extend the transaction here?
3917                  */
3918                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3919                                                         right_path);
3920                 ocfs2_complete_edge_insert(inode, handle, left_path,
3921                                            right_path, subtree_index);
3922         }
3923
3924         ret = 0;
3925 out:
3926         return ret;
3927 }
3928
3929 static int ocfs2_do_insert_extent(struct inode *inode,
3930                                   handle_t *handle,
3931                                   struct ocfs2_extent_tree *et,
3932                                   struct ocfs2_extent_rec *insert_rec,
3933                                   struct ocfs2_insert_type *type)
3934 {
3935         int ret, rotate = 0;
3936         u32 cpos;
3937         struct ocfs2_path *right_path = NULL;
3938         struct ocfs2_path *left_path = NULL;
3939         struct ocfs2_extent_list *el;
3940
3941         el = et->et_root_el;
3942
3943         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3944                                    OCFS2_JOURNAL_ACCESS_WRITE);
3945         if (ret) {
3946                 mlog_errno(ret);
3947                 goto out;
3948         }
3949
3950         if (le16_to_cpu(el->l_tree_depth) == 0) {
3951                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3952                 goto out_update_clusters;
3953         }
3954
3955         right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
3956         if (!right_path) {
3957                 ret = -ENOMEM;
3958                 mlog_errno(ret);
3959                 goto out;
3960         }
3961
3962         /*
3963          * Determine the path to start with. Rotations need the
3964          * rightmost path, everything else can go directly to the
3965          * target leaf.
3966          */
3967         cpos = le32_to_cpu(insert_rec->e_cpos);
3968         if (type->ins_appending == APPEND_NONE &&
3969             type->ins_contig == CONTIG_NONE) {
3970                 rotate = 1;
3971                 cpos = UINT_MAX;
3972         }
3973
3974         ret = ocfs2_find_path(inode, right_path, cpos);
3975         if (ret) {
3976                 mlog_errno(ret);
3977                 goto out;
3978         }
3979
3980         /*
3981          * Rotations and appends need special treatment - they modify
3982          * parts of the tree above them.
3983          *
3984          * Both might pass back a path immediately to the left of the
3985          * one being inserted into. This will cause
3986          * ocfs2_insert_path() to modify the rightmost records of
3987          * left_path to account for an edge insert.
3988          *
3989          * XXX: When modifying this code, keep in mind that an insert
3990          * can wind up skipping both of these two special cases...
3991          */
3992         if (rotate) {
3993                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
3994                                               le32_to_cpu(insert_rec->e_cpos),
3995                                               right_path, &left_path);
3996                 if (ret) {
3997                         mlog_errno(ret);
3998                         goto out;
3999                 }
4000
4001                 /*
4002                  * ocfs2_rotate_tree_right() might have extended the
4003                  * transaction without re-journaling our tree root.
4004                  */
4005                 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
4006                                            OCFS2_JOURNAL_ACCESS_WRITE);
4007                 if (ret) {
4008                         mlog_errno(ret);
4009                         goto out;
4010                 }
4011         } else if (type->ins_appending == APPEND_TAIL
4012                    && type->ins_contig != CONTIG_LEFT) {
4013                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4014                                                right_path, &left_path);
4015                 if (ret) {
4016                         mlog_errno(ret);
4017                         goto out;
4018                 }
4019         }
4020
4021         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4022                                 insert_rec, type);
4023         if (ret) {
4024                 mlog_errno(ret);
4025                 goto out;
4026         }
4027
4028 out_update_clusters:
4029         if (type->ins_split == SPLIT_NONE)
4030                 ocfs2_et_update_clusters(inode, et,
4031                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4032
4033         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4034         if (ret)
4035                 mlog_errno(ret);
4036
4037 out:
4038         ocfs2_free_path(left_path);
4039         ocfs2_free_path(right_path);
4040
4041         return ret;
4042 }
4043
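     /*
      * Figure out whether split_rec is contiguous with the records
      * logically before and after el->l_recs[index]. When a neighbour
      * lives in an adjacent leaf, that leaf is looked up and checked,
      * which is how merges across extent blocks are detected.
      */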
4044 static enum ocfs2_contig_type
4045 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4046                                struct ocfs2_extent_list *el, int index,
4047                                struct ocfs2_extent_rec *split_rec)
4048 {
4049         int status;
4050         enum ocfs2_contig_type ret = CONTIG_NONE;
4051         u32 left_cpos, right_cpos;
4052         struct ocfs2_extent_rec *rec = NULL;
4053         struct ocfs2_extent_list *new_el;
4054         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4055         struct buffer_head *bh;
4056         struct ocfs2_extent_block *eb;
4057
4058         if (index > 0) {
4059                 rec = &el->l_recs[index - 1];
4060         } else if (path->p_tree_depth > 0) {
4061                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4062                                                        path, &left_cpos);
4063                 if (status)
4064                         goto out;
4065
4066                 if (left_cpos != 0) {
4067                         left_path = ocfs2_new_path(path_root_bh(path),
4068                                                    path_root_el(path));
4069                         if (!left_path)
4070                                 goto out;
4071
4072                         status = ocfs2_find_path(inode, left_path, left_cpos);
4073                         if (status)
4074                                 goto out;
4075
4076                         new_el = path_leaf_el(left_path);
4077
4078                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4079                             le16_to_cpu(new_el->l_count)) {
4080                                 bh = path_leaf_bh(left_path);
4081                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4082                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4083                                                                  eb);
4084                                 goto out;
4085                         }
4086                         rec = &new_el->l_recs[
4087                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4088                 }
4089         }
4090
4091         /*
4092          * We're careful to check for an empty extent record here -
4093          * the merge code will know what to do if it sees one.
4094          */
4095         if (rec) {
4096                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4097                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4098                                 ret = CONTIG_RIGHT;
4099                 } else {
4100                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4101                 }
4102         }
4103
4104         rec = NULL;
4105         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4106                 rec = &el->l_recs[index + 1];
4107         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4108                  path->p_tree_depth > 0) {
4109                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4110                                                         path, &right_cpos);
4111                 if (status)
4112                         goto out;
4113
4114                 if (right_cpos == 0)
4115                         goto out;
4116
4117                 right_path = ocfs2_new_path(path_root_bh(path),
4118                                             path_root_el(path));
4119                 if (!right_path)
4120                         goto out;
4121
4122                 status = ocfs2_find_path(inode, right_path, right_cpos);
4123                 if (status)
4124                         goto out;
4125
4126                 new_el = path_leaf_el(right_path);
4127                 rec = &new_el->l_recs[0];
4128                 if (ocfs2_is_empty_extent(rec)) {
4129                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4130                                 bh = path_leaf_bh(right_path);
4131                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4132                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4133                                                                  eb);
4134                                 goto out;
4135                         }
4136                         rec = &new_el->l_recs[1];
4137                 }
4138         }
4139
4140         if (rec) {
4141                 enum ocfs2_contig_type contig_type;
4142
4143                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4144
4145                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4146                         ret = CONTIG_LEFTRIGHT;
4147                 else if (ret == CONTIG_NONE)
4148                         ret = contig_type;
4149         }
4150
4151 out:
4152         if (left_path)
4153                 ocfs2_free_path(left_path);
4154         if (right_path)
4155                 ocfs2_free_path(right_path);
4156
4157         return ret;
4158 }
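
/*
 * For example: if split_rec covers clusters [108, 112) and is both
 * logically and physically contiguous with its neighbours, then the
 * left neighbour ending at cpos 108 yields CONTIG_RIGHT, the right
 * neighbour starting at cpos 112 yields CONTIG_LEFT, and the two
 * combine into CONTIG_LEFTRIGHT - the merge code can then fold the
 * record into both neighbours.  If only one side lines up we return
 * that side's value, otherwise CONTIG_NONE.
 */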
4159
4160 static void ocfs2_figure_contig_type(struct inode *inode,
4161                                      struct ocfs2_insert_type *insert,
4162                                      struct ocfs2_extent_list *el,
4163                                      struct ocfs2_extent_rec *insert_rec,
4164                                      struct ocfs2_extent_tree *et)
4165 {
4166         int i;
4167         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4168
4169         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4170
4171         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4172                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4173                                                   insert_rec);
4174                 if (contig_type != CONTIG_NONE) {
4175                         insert->ins_contig_index = i;
4176                         break;
4177                 }
4178         }
4179         insert->ins_contig = contig_type;
4180
4181         if (insert->ins_contig != CONTIG_NONE) {
4182                 struct ocfs2_extent_rec *rec =
4183                                 &el->l_recs[insert->ins_contig_index];
4184                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4185                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4186
4187                 /*
4188                  * Caller might want us to limit the size of extents, so don't
4189                  * calculate contiguousness if we might exceed that limit.
4190                  */
4191                 if (et->et_max_leaf_clusters &&
4192                     (len > et->et_max_leaf_clusters))
4193                         insert->ins_contig = CONTIG_NONE;
4194         }
4195 }
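
/*
 * For example, if et_max_leaf_clusters were 16, a matched record that
 * already holds 12 clusters plus an insert of 8 more would total 20,
 * so ins_contig is forced back to CONTIG_NONE and the two records are
 * kept separate rather than merged.
 */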
4196
4197 /*
4198  * This should only be called against the rightmost leaf extent list.
4199  *
4200  * ocfs2_figure_appending_type() will figure out whether we'll have to
4201  * insert at the tail of the rightmost leaf.
4202  *
4203  * This should also work against the root extent list for trees with 0
4204  * depth. If we consider the root extent list to be the rightmost leaf node
4205  * then the logic here makes sense.
4206  */
4207 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4208                                         struct ocfs2_extent_list *el,
4209                                         struct ocfs2_extent_rec *insert_rec)
4210 {
4211         int i;
4212         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4213         struct ocfs2_extent_rec *rec;
4214
4215         insert->ins_appending = APPEND_NONE;
4216
4217         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4218
4219         if (!el->l_next_free_rec)
4220                 goto set_tail_append;
4221
4222         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4223                 /* Were all records empty? */
4224                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4225                         goto set_tail_append;
4226         }
4227
4228         i = le16_to_cpu(el->l_next_free_rec) - 1;
4229         rec = &el->l_recs[i];
4230
4231         if (cpos >=
4232             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4233                 goto set_tail_append;
4234
4235         return;
4236
4237 set_tail_append:
4238         insert->ins_appending = APPEND_TAIL;
4239 }
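
/*
 * For example, if the rightmost used record covers clusters [200, 232),
 * an insert_rec starting at cpos 232 or beyond is a tail append
 * (APPEND_TAIL), while one starting at cpos 100 is not (APPEND_NONE).
 * An empty list, or a list whose only record is the empty extent, is
 * also treated as a tail append.
 */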
4240
4241 /*
4242  * Helper function called at the beginning of an insert.
4243  *
4244  * This computes a few things that are commonly used in the process of
4245  * inserting into the btree:
4246  *   - Whether the new extent is contiguous with an existing one.
4247  *   - The current tree depth.
4248  *   - Whether the insert is an appending one.
4249  *   - The total # of free records in the tree.
4250  *
4251  * All of the information is stored on the ocfs2_insert_type
4252  * structure.
4253  */
4254 static int ocfs2_figure_insert_type(struct inode *inode,
4255                                     struct ocfs2_extent_tree *et,
4256                                     struct buffer_head **last_eb_bh,
4257                                     struct ocfs2_extent_rec *insert_rec,
4258                                     int *free_records,
4259                                     struct ocfs2_insert_type *insert)
4260 {
4261         int ret;
4262         struct ocfs2_extent_block *eb;
4263         struct ocfs2_extent_list *el;
4264         struct ocfs2_path *path = NULL;
4265         struct buffer_head *bh = NULL;
4266
4267         insert->ins_split = SPLIT_NONE;
4268
4269         el = et->et_root_el;
4270         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4271
4272         if (el->l_tree_depth) {
4273                 /*
4274                  * If we have tree depth, we read in the
4275                  * rightmost extent block ahead of time as
4276                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4277                  * may want it later.
4278                  */
4279                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4280                                        ocfs2_et_get_last_eb_blk(et), &bh,
4281                                        OCFS2_BH_CACHED, inode);
4282                 if (ret) {
4283                         mlog_exit(ret);
4284                         goto out;
4285                 }
4286                 eb = (struct ocfs2_extent_block *) bh->b_data;
4287                 el = &eb->h_list;
4288         }
4289
4290         /*
4291          * Unless we have a contiguous insert, we'll need to know if
4292          * there is room left in our allocation tree for another
4293          * extent record.
4294          *
4295          * XXX: This test is simplistic; we can search for empty
4296          * extent records too.
4297          */
4298         *free_records = le16_to_cpu(el->l_count) -
4299                 le16_to_cpu(el->l_next_free_rec);
4300
4301         if (!insert->ins_tree_depth) {
4302                 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4303                 ocfs2_figure_appending_type(insert, el, insert_rec);
4304                 return 0;
4305         }
4306
4307         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4308         if (!path) {
4309                 ret = -ENOMEM;
4310                 mlog_errno(ret);
4311                 goto out;
4312         }
4313
4314         /*
4315          * In the case that we're inserting past what the tree
4316          * currently accounts for, ocfs2_find_path() will return for
4317          * us the rightmost tree path. This is accounted for below in
4318          * the appending code.
4319          */
4320         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4321         if (ret) {
4322                 mlog_errno(ret);
4323                 goto out;
4324         }
4325
4326         el = path_leaf_el(path);
4327
4328         /*
4329          * Now that we have the path, there are two things we want to determine:
4330          * 1) Contiguousness (also set contig_index if this is so)
4331          *
4332          * 2) Are we doing an append? We can trivially break this up
4333          *     into two types of appends: simple record append, or a
4334          *     rotate inside the tail leaf.
4335          */
4336         ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4337
4338         /*
4339          * The insert code isn't quite ready to deal with all cases of
4340          * left contiguousness. Specifically, if it's an insert into
4341          * the 1st record in a leaf, it will require the adjustment of
4342          * cluster count on the last record of the path directly to its
4343          * left. For now, just catch that case and fool the layers
4344          * above us. This works just fine for tree_depth == 0, which
4345          * is why we allow that above.
4346          */
4347         if (insert->ins_contig == CONTIG_LEFT &&
4348             insert->ins_contig_index == 0)
4349                 insert->ins_contig = CONTIG_NONE;
4350
4351         /*
4352          * Ok, so we can simply compare against last_eb to figure out
4353          * whether the path doesn't exist. This will only happen in
4354          * the case that we're doing a tail append, so maybe we can
4355          * take advantage of that information somehow.
4356          */
4357         if (ocfs2_et_get_last_eb_blk(et) ==
4358             path_leaf_bh(path)->b_blocknr) {
4359                 /*
4360                  * Ok, ocfs2_find_path() returned us the rightmost
4361                  * tree path. This might be an appending insert. There are
4362                  * two cases:
4363                  *    1) We're doing a true append at the tail:
4364                  *      -This might even be off the end of the leaf
4365                  *    2) We're "appending" by rotating in the tail
4366                  */
4367                 ocfs2_figure_appending_type(insert, el, insert_rec);
4368         }
4369
4370 out:
4371         ocfs2_free_path(path);
4372
4373         if (ret == 0)
4374                 *last_eb_bh = bh;
4375         else
4376                 brelse(bh);
4377         return ret;
4378 }
4379
4380 /*
4381  * Insert an extent into an inode btree.
4382  *
4383  * The caller needs to update fe->i_clusters
4384  */
4385 static int ocfs2_insert_extent(struct ocfs2_super *osb,
4386                                handle_t *handle,
4387                                struct inode *inode,
4388                                struct buffer_head *root_bh,
4389                                u32 cpos,
4390                                u64 start_blk,
4391                                u32 new_clusters,
4392                                u8 flags,
4393                                struct ocfs2_alloc_context *meta_ac,
4394                                struct ocfs2_extent_tree *et)
4395 {
4396         int status;
4397         int uninitialized_var(free_records);
4398         struct buffer_head *last_eb_bh = NULL;
4399         struct ocfs2_insert_type insert = {0, };
4400         struct ocfs2_extent_rec rec;
4401
4402         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4403
4404         mlog(0, "add %u clusters at position %u to inode %llu\n",
4405              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4406
4407         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4408                         (OCFS2_I(inode)->ip_clusters != cpos),
4409                         "Device %s, asking for sparse allocation: inode %llu, "
4410                         "cpos %u, clusters %u\n",
4411                         osb->dev_str,
4412                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4413                         OCFS2_I(inode)->ip_clusters);
4414
4415         memset(&rec, 0, sizeof(rec));
4416         rec.e_cpos = cpu_to_le32(cpos);
4417         rec.e_blkno = cpu_to_le64(start_blk);
4418         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4419         rec.e_flags = flags;
4420
4421         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4422                                           &free_records, &insert);
4423         if (status < 0) {
4424                 mlog_errno(status);
4425                 goto bail;
4426         }
4427
4428         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4429              "Insert.contig_index: %d, Insert.free_records: %d, "
4430              "Insert.tree_depth: %d\n",
4431              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4432              free_records, insert.ins_tree_depth);
4433
4434         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4435                 status = ocfs2_grow_tree(inode, handle, et,
4436                                          &insert.ins_tree_depth, &last_eb_bh,
4437                                          meta_ac);
4438                 if (status) {
4439                         mlog_errno(status);
4440                         goto bail;
4441                 }
4442         }
4443
4444         /* Finally, we can add clusters. This might rotate the tree for us. */
4445         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4446         if (status < 0)
4447                 mlog_errno(status);
4448         else if (et->et_type == OCFS2_DINODE_EXTENT)
4449                 ocfs2_extent_map_insert_rec(inode, &rec);
4450
4451 bail:
4452         if (last_eb_bh)
4453                 brelse(last_eb_bh);
4454
4455         mlog_exit(status);
4456         return status;
4457 }
4458
4459 int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
4460                                handle_t *handle,
4461                                struct inode *inode,
4462                                struct buffer_head *root_bh,
4463                                u32 cpos,
4464                                u64 start_blk,
4465                                u32 new_clusters,
4466                                u8 flags,
4467                                struct ocfs2_alloc_context *meta_ac)
4468 {
4469         int status;
4470         struct ocfs2_extent_tree et;
4471
4472         ocfs2_get_dinode_extent_tree(&et, inode, root_bh);
4473         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4474                                      cpos, start_blk, new_clusters,
4475                                      flags, meta_ac, &et);
4476         ocfs2_put_extent_tree(&et);
4477
4478         return status;
4479 }
4480
4481 int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
4482                                     handle_t *handle,
4483                                     struct inode *inode,
4484                                     struct buffer_head *root_bh,
4485                                     u32 cpos,
4486                                     u64 start_blk,
4487                                     u32 new_clusters,
4488                                     u8 flags,
4489                                     struct ocfs2_alloc_context *meta_ac,
4490                                     struct ocfs2_xattr_value_root *xv)
4491 {
4492         int status;
4493         struct ocfs2_extent_tree et;
4494
4495         ocfs2_get_xattr_value_extent_tree(&et, inode, root_bh, xv);
4496         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4497                                      cpos, start_blk, new_clusters,
4498                                      flags, meta_ac, &et);
4499         ocfs2_put_extent_tree(&et);
4500
4501         return status;
4502 }
4503
4504 int ocfs2_xattr_tree_insert_extent(struct ocfs2_super *osb,
4505                                    handle_t *handle,
4506                                    struct inode *inode,
4507                                    struct buffer_head *root_bh,
4508                                    u32 cpos,
4509                                    u64 start_blk,
4510                                    u32 new_clusters,
4511                                    u8 flags,
4512                                    struct ocfs2_alloc_context *meta_ac)
4513 {
4514         int status;
4515         struct ocfs2_extent_tree et;
4516
4517         ocfs2_get_xattr_tree_extent_tree(&et, inode, root_bh);
4518         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4519                                      cpos, start_blk, new_clusters,
4520                                      flags, meta_ac, &et);
4521         ocfs2_put_extent_tree(&et);
4522
4523         return status;
4524 }
4525
4526 /*
4527  * Allocate and add clusters into the extent b-tree.
4528  * The new clusters (clusters_to_add) will be inserted at logical_offset.
4529  * The extent b-tree's root is root_el and it should be in root_bh, and
4530  * it is not limited to file storage. Any extent tree can use this
4531  * function if it implements the proper ocfs2_extent_tree.
4532  */
4533 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4534                                 struct inode *inode,
4535                                 u32 *logical_offset,
4536                                 u32 clusters_to_add,
4537                                 int mark_unwritten,
4538                                 struct buffer_head *root_bh,
4539                                 struct ocfs2_extent_list *root_el,
4540                                 handle_t *handle,
4541                                 struct ocfs2_alloc_context *data_ac,
4542                                 struct ocfs2_alloc_context *meta_ac,
4543                                 enum ocfs2_alloc_restarted *reason_ret,
4544                                 enum ocfs2_extent_tree_type type,
4545                                 void *obj)
4546 {
4547         int status = 0;
4548         int free_extents;
4549         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4550         u32 bit_off, num_bits;
4551         u64 block;
4552         u8 flags = 0;
4553
4554         BUG_ON(!clusters_to_add);
4555
4556         if (mark_unwritten)
4557                 flags = OCFS2_EXT_UNWRITTEN;
4558
4559         free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
4560                                               obj);
4561         if (free_extents < 0) {
4562                 status = free_extents;
4563                 mlog_errno(status);
4564                 goto leave;
4565         }
4566
4567         /* there are two cases which could cause us to return -EAGAIN in the
4568          * we-need-more-metadata case:
4569          * 1) we haven't reserved *any*
4570          * 2) we are so fragmented, we've needed to add metadata too
4571          *    many times. */
4572         if (!free_extents && !meta_ac) {
4573                 mlog(0, "we haven't reserved any metadata!\n");
4574                 status = -EAGAIN;
4575                 reason = RESTART_META;
4576                 goto leave;
4577         } else if ((!free_extents)
4578                    && (ocfs2_alloc_context_bits_left(meta_ac)
4579                        < ocfs2_extend_meta_needed(root_el))) {
4580                 mlog(0, "filesystem is really fragmented...\n");
4581                 status = -EAGAIN;
4582                 reason = RESTART_META;
4583                 goto leave;
4584         }
4585
4586         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4587                                         clusters_to_add, &bit_off, &num_bits);
4588         if (status < 0) {
4589                 if (status != -ENOSPC)
4590                         mlog_errno(status);
4591                 goto leave;
4592         }
4593
4594         BUG_ON(num_bits > clusters_to_add);
4595
4596         /* reserve our write early -- insert_extent may update the inode */
4597         status = ocfs2_journal_access(handle, inode, root_bh,
4598                                       OCFS2_JOURNAL_ACCESS_WRITE);
4599         if (status < 0) {
4600                 mlog_errno(status);
4601                 goto leave;
4602         }
4603
4604         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4605         mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4606              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4607         if (type == OCFS2_DINODE_EXTENT)
4608                 status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh,
4609                                                     *logical_offset, block,
4610                                                     num_bits, flags, meta_ac);
4611         else if (type == OCFS2_XATTR_TREE_EXTENT)
4612                 status = ocfs2_xattr_tree_insert_extent(osb, handle,
4613                                                         inode, root_bh,
4614                                                         *logical_offset,
4615                                                         block, num_bits, flags,
4616                                                         meta_ac);
4617         else
4618                 status = ocfs2_xattr_value_insert_extent(osb, handle,
4619                                                          inode, root_bh,
4620                                                          *logical_offset,
4621                                                          block, num_bits, flags,
4622                                                          meta_ac, obj);
4623         if (status < 0) {
4624                 mlog_errno(status);
4625                 goto leave;
4626         }
4627
4628         status = ocfs2_journal_dirty(handle, root_bh);
4629         if (status < 0) {
4630                 mlog_errno(status);
4631                 goto leave;
4632         }
4633
4634         clusters_to_add -= num_bits;
4635         *logical_offset += num_bits;
4636
4637         if (clusters_to_add) {
4638                 mlog(0, "need to alloc once more, wanted = %u\n",
4639                      clusters_to_add);
4640                 status = -EAGAIN;
4641                 reason = RESTART_TRANS;
4642         }
4643
4644 leave:
4645         mlog_exit(status);
4646         if (reason_ret)
4647                 *reason_ret = reason;
4648         return status;
4649 }
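
/*
 * A rough sketch of how a caller typically drives this (simplified;
 * locking, quota and transaction management omitted - see
 * ocfs2_extend_allocation() in file.c for the real thing):
 *
 *	while (clusters_to_add) {
 *		status = ocfs2_add_clusters_in_btree(osb, inode, &logical,
 *						     clusters_to_add, 0,
 *						     root_bh, root_el, handle,
 *						     data_ac, meta_ac, &why,
 *						     OCFS2_DINODE_EXTENT, NULL);
 *		if (status == -EAGAIN && why == RESTART_META)
 *			... reserve more metadata blocks, then retry ...
 *		else if (status == -EAGAIN && why == RESTART_TRANS)
 *			... extend or restart the transaction, then retry ...
 *		else if (status < 0)
 *			break;
 *		... recompute clusters_to_add from what was allocated ...
 *	}
 */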
4650
4651 static void ocfs2_make_right_split_rec(struct super_block *sb,
4652                                        struct ocfs2_extent_rec *split_rec,
4653                                        u32 cpos,
4654                                        struct ocfs2_extent_rec *rec)
4655 {
4656         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4657         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4658
4659         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4660
4661         split_rec->e_cpos = cpu_to_le32(cpos);
4662         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4663
4664         split_rec->e_blkno = rec->e_blkno;
4665         le64_add_cpu(&split_rec->e_blkno,
4666                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4667
4668         split_rec->e_flags = rec->e_flags;
4669 }
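
/*
 * For example, if rec covers clusters [100, 120) starting at block B
 * and cpos is 112, the resulting split_rec covers [112, 120):
 * e_cpos = 112, e_leaf_clusters = 8, e_blkno = B plus the blocks that
 * make up 12 clusters, and e_flags copied from rec.
 */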
4670
4671 static int ocfs2_split_and_insert(struct inode *inode,
4672                                   handle_t *handle,
4673                                   struct ocfs2_path *path,
4674                                   struct ocfs2_extent_tree *et,
4675                                   struct buffer_head **last_eb_bh,
4676                                   int split_index,
4677                                   struct ocfs2_extent_rec *orig_split_rec,
4678                                   struct ocfs2_alloc_context *meta_ac)
4679 {
4680         int ret = 0, depth;
4681         unsigned int insert_range, rec_range, do_leftright = 0;
4682         struct ocfs2_extent_rec tmprec;
4683         struct ocfs2_extent_list *rightmost_el;
4684         struct ocfs2_extent_rec rec;
4685         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4686         struct ocfs2_insert_type insert;
4687         struct ocfs2_extent_block *eb;
4688
4689 leftright:
4690         /*
4691          * Store a copy of the record on the stack - it might move
4692          * around as the tree is manipulated below.
4693          */
4694         rec = path_leaf_el(path)->l_recs[split_index];
4695
4696         rightmost_el = et->et_root_el;
4697
4698         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4699         if (depth) {
4700                 BUG_ON(!(*last_eb_bh));
4701                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4702                 rightmost_el = &eb->h_list;
4703         }
4704
4705         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4706             le16_to_cpu(rightmost_el->l_count)) {
4707                 ret = ocfs2_grow_tree(inode, handle, et,
4708                                       &depth, last_eb_bh, meta_ac);
4709                 if (ret) {
4710                         mlog_errno(ret);
4711                         goto out;
4712                 }
4713         }
4714
4715         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4716         insert.ins_appending = APPEND_NONE;
4717         insert.ins_contig = CONTIG_NONE;
4718         insert.ins_tree_depth = depth;
4719
4720         insert_range = le32_to_cpu(split_rec.e_cpos) +
4721                 le16_to_cpu(split_rec.e_leaf_clusters);
4722         rec_range = le32_to_cpu(rec.e_cpos) +
4723                 le16_to_cpu(rec.e_leaf_clusters);
4724
4725         if (split_rec.e_cpos == rec.e_cpos) {
4726                 insert.ins_split = SPLIT_LEFT;
4727         } else if (insert_range == rec_range) {
4728                 insert.ins_split = SPLIT_RIGHT;
4729         } else {
4730                 /*
4731                  * Left/right split. We fake this as a right split
4732                  * first and then make a second pass as a left split.
4733                  */
4734                 insert.ins_split = SPLIT_RIGHT;
4735
4736                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4737                                            &rec);
4738
4739                 split_rec = tmprec;
4740
4741                 BUG_ON(do_leftright);
4742                 do_leftright = 1;
4743         }
4744
4745         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4746         if (ret) {
4747                 mlog_errno(ret);
4748                 goto out;
4749         }
4750
4751         if (do_leftright == 1) {
4752                 u32 cpos;
4753                 struct ocfs2_extent_list *el;
4754
4755                 do_leftright++;
4756                 split_rec = *orig_split_rec;
4757
4758                 ocfs2_reinit_path(path, 1);
4759
4760                 cpos = le32_to_cpu(split_rec.e_cpos);
4761                 ret = ocfs2_find_path(inode, path, cpos);
4762                 if (ret) {
4763                         mlog_errno(ret);
4764                         goto out;
4765                 }
4766
4767                 el = path_leaf_el(path);
4768                 split_index = ocfs2_search_extent_list(el, cpos);
4769                 goto leftright;
4770         }
4771 out:
4772
4773         return ret;
4774 }
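
/*
 * For example, splitting [108, 112) out of a record covering
 * [100, 120) matches neither edge, so two passes are made: the first
 * fakes a right split at 112 and inserts [112, 120), the second
 * re-finds the leaf and right-splits the remaining [100, 112) at 108.
 * The leaf ends up with three records: [100, 108), [108, 112) and
 * [112, 120).
 */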
4775
4776 /*
4777  * Mark part or all of the extent record at split_index in the leaf
4778  * pointed to by path as written. This removes the unwritten
4779  * extent flag.
4780  *
4781  * Care is taken to handle contiguousness so as to not grow the tree.
4782  *
4783  * meta_ac is not strictly necessary - we only truly need it if growth
4784  * of the tree is required. All other cases will degrade into a less
4785  * optimal tree layout.
4786  *
4787  * last_eb_bh should be the rightmost leaf block for any extent
4788  * btree. Since a split may grow the tree or a merge might shrink it,
4789  * the caller cannot trust the contents of that buffer after this call.
4790  *
4791  * This code is optimized for readability - several passes might be
4792  * made over certain portions of the tree. All of those blocks will
4793  * have been brought into cache (and pinned via the journal), so the
4794  * extra overhead is not expressed in terms of disk reads.
4795  */
4796 static int __ocfs2_mark_extent_written(struct inode *inode,
4797                                        struct ocfs2_extent_tree *et,
4798                                        handle_t *handle,
4799                                        struct ocfs2_path *path,
4800                                        int split_index,
4801                                        struct ocfs2_extent_rec *split_rec,
4802                                        struct ocfs2_alloc_context *meta_ac,
4803                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4804 {
4805         int ret = 0;
4806         struct ocfs2_extent_list *el = path_leaf_el(path);
4807         struct buffer_head *last_eb_bh = NULL;
4808         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4809         struct ocfs2_merge_ctxt ctxt;
4810         struct ocfs2_extent_list *rightmost_el;
4811
4812         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4813                 ret = -EIO;
4814                 mlog_errno(ret);
4815                 goto out;
4816         }
4817
4818         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4819             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4820              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4821                 ret = -EIO;
4822                 mlog_errno(ret);
4823                 goto out;
4824         }
4825
4826         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4827                                                             split_index,
4828                                                             split_rec);
4829
4830         /*
4831          * The core merge / split code wants to know how much room is
4832          * left in this inode's allocation tree, so we pass the
4833          * rightmost extent list.
4834          */
4835         if (path->p_tree_depth) {
4836                 struct ocfs2_extent_block *eb;
4837
4838                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4839                                        ocfs2_et_get_last_eb_blk(et),
4840                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4841                 if (ret) {
4842                         mlog_exit(ret);
4843                         goto out;
4844                 }
4845
4846                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4847                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4848                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4849                         ret = -EROFS;
4850                         goto out;
4851                 }
4852
4853                 rightmost_el = &eb->h_list;
4854         } else
4855                 rightmost_el = path_root_el(path);
4856
4857         if (rec->e_cpos == split_rec->e_cpos &&
4858             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4859                 ctxt.c_split_covers_rec = 1;
4860         else
4861                 ctxt.c_split_covers_rec = 0;
4862
4863         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4864
4865         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4866              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4867              ctxt.c_split_covers_rec);
4868
4869         if (ctxt.c_contig_type == CONTIG_NONE) {
4870                 if (ctxt.c_split_covers_rec)
4871                         el->l_recs[split_index] = *split_rec;
4872                 else
4873                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4874                                                      &last_eb_bh, split_index,
4875                                                      split_rec, meta_ac);
4876                 if (ret)
4877                         mlog_errno(ret);
4878         } else {
4879                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4880                                                 split_index, split_rec,
4881                                                 dealloc, &ctxt, et);
4882                 if (ret)
4883                         mlog_errno(ret);
4884         }
4885
4886 out:
4887         brelse(last_eb_bh);
4888         return ret;
4889 }
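
/*
 * For example, marking all of an unwritten record [100, 120) written
 * when neither neighbour is contiguous simply overwrites the record in
 * place (the CONTIG_NONE + c_split_covers_rec case above).  Marking
 * only [104, 108) written takes the ocfs2_split_and_insert() path, and
 * a contiguous neighbour sends us through ocfs2_try_to_merge_extent()
 * instead.
 */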
4890
4891 /*
4892  * Mark the already-existing extent at cpos as written for len clusters.
4893  *
4894  * If the existing extent is larger than the request, initiate a
4895  * split. An attempt will be made at merging with adjacent extents.
4896  *
4897  * The caller is responsible for passing down meta_ac if we'll need it.
4898  */
4899 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
4900                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4901                               struct ocfs2_alloc_context *meta_ac,
4902                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4903                               enum ocfs2_extent_tree_type et_type,
4904                               void *obj)
4905 {
4906         int ret, index;
4907         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4908         struct ocfs2_extent_rec split_rec;
4909         struct ocfs2_path *left_path = NULL;
4910         struct ocfs2_extent_list *el;
4911         struct ocfs2_extent_tree et;
4912
4913         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4914              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4915
4916         ocfs2_get_extent_tree(&et, inode, root_bh, et_type, obj);
4917
4918         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4919                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4920                             "that are being written to, but the feature bit "
4921                             "is not set in the super block.",
4922                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4923                 ret = -EROFS;
4924                 goto out;
4925         }
4926
4927         /*
4928          * XXX: This should be fixed up so that we just re-insert the
4929          * next extent records.
4930          */
4931         if (et_type == OCFS2_DINODE_EXTENT)
4932                 ocfs2_extent_map_trunc(inode, 0);
4933
4934         left_path = ocfs2_new_path(et.et_root_bh, et.et_root_el);
4935         if (!left_path) {
4936                 ret = -ENOMEM;
4937                 mlog_errno(ret);
4938                 goto out;
4939         }
4940
4941         ret = ocfs2_find_path(inode, left_path, cpos);
4942         if (ret) {
4943                 mlog_errno(ret);
4944                 goto out;
4945         }
4946         el = path_leaf_el(left_path);
4947
4948         index = ocfs2_search_extent_list(el, cpos);
4949         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4950                 ocfs2_error(inode->i_sb,
4951                             "Inode %llu has an extent at cpos %u which can no "
4952                             "longer be found.\n",
4953                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4954                 ret = -EROFS;
4955                 goto out;
4956         }
4957
4958         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4959         split_rec.e_cpos = cpu_to_le32(cpos);
4960         split_rec.e_leaf_clusters = cpu_to_le16(len);
4961         split_rec.e_blkno = cpu_to_le64(start_blkno);
4962         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4963         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4964
4965         ret = __ocfs2_mark_extent_written(inode, &et, handle, left_path,
4966                                           index, &split_rec, meta_ac,
4967                                           dealloc);
4968         if (ret)
4969                 mlog_errno(ret);
4970
4971 out:
4972         ocfs2_free_path(left_path);
4973         ocfs2_put_extent_tree(&et);
4974         return ret;
4975 }
4976
4977 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4978                             handle_t *handle, struct ocfs2_path *path,
4979                             int index, u32 new_range,
4980                             struct ocfs2_alloc_context *meta_ac)
4981 {
4982         int ret, depth, credits = handle->h_buffer_credits;
4983         struct buffer_head *last_eb_bh = NULL;
4984         struct ocfs2_extent_block *eb;
4985         struct ocfs2_extent_list *rightmost_el, *el;
4986         struct ocfs2_extent_rec split_rec;
4987         struct ocfs2_extent_rec *rec;
4988         struct ocfs2_insert_type insert;
4989
4990         /*
4991          * Setup the record to split before we grow the tree.
4992          */
4993         el = path_leaf_el(path);
4994         rec = &el->l_recs[index];
4995         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4996
4997         depth = path->p_tree_depth;
4998         if (depth > 0) {
4999                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
5000                                        ocfs2_et_get_last_eb_blk(et),
5001                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
5002                 if (ret < 0) {
5003                         mlog_errno(ret);
5004                         goto out;
5005                 }
5006
5007                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5008                 rightmost_el = &eb->h_list;
5009         } else
5010                 rightmost_el = path_leaf_el(path);
5011
5012         credits += path->p_tree_depth +
5013                    ocfs2_extend_meta_needed(et->et_root_el);
5014         ret = ocfs2_extend_trans(handle, credits);
5015         if (ret) {
5016                 mlog_errno(ret);
5017                 goto out;
5018         }
5019
5020         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5021             le16_to_cpu(rightmost_el->l_count)) {
5022                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
5023                                       meta_ac);
5024                 if (ret) {
5025                         mlog_errno(ret);
5026                         goto out;
5027                 }
5028         }
5029
5030         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5031         insert.ins_appending = APPEND_NONE;
5032         insert.ins_contig = CONTIG_NONE;
5033         insert.ins_split = SPLIT_RIGHT;
5034         insert.ins_tree_depth = depth;
5035
5036         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
5037         if (ret)
5038                 mlog_errno(ret);
5039
5040 out:
5041         brelse(last_eb_bh);
5042         return ret;
5043 }
5044
5045 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5046                               struct ocfs2_path *path, int index,
5047                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5048                               u32 cpos, u32 len,
5049                               struct ocfs2_extent_tree *et)
5050 {
5051         int ret;
5052         u32 left_cpos, rec_range, trunc_range;
5053         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5054         struct super_block *sb = inode->i_sb;
5055         struct ocfs2_path *left_path = NULL;
5056         struct ocfs2_extent_list *el = path_leaf_el(path);
5057         struct ocfs2_extent_rec *rec;
5058         struct ocfs2_extent_block *eb;
5059
5060         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5061                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5062                 if (ret) {
5063                         mlog_errno(ret);
5064                         goto out;
5065                 }
5066
5067                 index--;
5068         }
5069
5070         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5071             path->p_tree_depth) {
5072                 /*
5073                  * Check whether this is the rightmost tree record. If
5074                  * we remove all of this record or part of its right
5075                  * edge then an update of the record lengths above it
5076                  * will be required.
5077                  */
5078                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5079                 if (eb->h_next_leaf_blk == 0)
5080                         is_rightmost_tree_rec = 1;
5081         }
5082
5083         rec = &el->l_recs[index];
5084         if (index == 0 && path->p_tree_depth &&
5085             le32_to_cpu(rec->e_cpos) == cpos) {
5086                 /*
5087                  * Changing the leftmost offset (via partial or whole
5088                  * record truncate) of an interior (or rightmost) path
5089                  * means we have to update the subtree that is formed
5090          * by this leaf and the one to its left.
5091                  *
5092                  * There are two cases we can skip:
5093                  *   1) Path is the leftmost one in our inode tree.
5094                  *   2) The leaf is rightmost and will be empty after
5095                  *      we remove the extent record - the rotate code
5096                  *      knows how to update the newly formed edge.
5097                  */
5098
5099                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5100                                                     &left_cpos);
5101                 if (ret) {
5102                         mlog_errno(ret);
5103                         goto out;
5104                 }
5105
5106                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5107                         left_path = ocfs2_new_path(path_root_bh(path),
5108                                                    path_root_el(path));
5109                         if (!left_path) {
5110                                 ret = -ENOMEM;
5111                                 mlog_errno(ret);
5112                                 goto out;
5113                         }
5114
5115                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5116                         if (ret) {
5117                                 mlog_errno(ret);
5118                                 goto out;
5119                         }
5120                 }
5121         }
5122
5123         ret = ocfs2_extend_rotate_transaction(handle, 0,
5124                                               handle->h_buffer_credits,
5125                                               path);
5126         if (ret) {
5127                 mlog_errno(ret);
5128                 goto out;
5129         }
5130
5131         ret = ocfs2_journal_access_path(inode, handle, path);
5132         if (ret) {
5133                 mlog_errno(ret);
5134                 goto out;
5135         }
5136
5137         ret = ocfs2_journal_access_path(inode, handle, left_path);
5138         if (ret) {
5139                 mlog_errno(ret);
5140                 goto out;
5141         }
5142
5143         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5144         trunc_range = cpos + len;
5145
5146         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5147                 int next_free;
5148
5149                 memset(rec, 0, sizeof(*rec));
5150                 ocfs2_cleanup_merge(el, index);
5151                 wants_rotate = 1;
5152
5153                 next_free = le16_to_cpu(el->l_next_free_rec);
5154                 if (is_rightmost_tree_rec && next_free > 1) {
5155                         /*
5156                          * We skip the edge update if this path will
5157                          * be deleted by the rotate code.
5158                          */
5159                         rec = &el->l_recs[next_free - 1];
5160                         ocfs2_adjust_rightmost_records(inode, handle, path,
5161                                                        rec);
5162                 }
5163         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5164                 /* Remove leftmost portion of the record. */
5165                 le32_add_cpu(&rec->e_cpos, len);
5166                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5167                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5168         } else if (rec_range == trunc_range) {
5169                 /* Remove rightmost portion of the record */
5170                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5171                 if (is_rightmost_tree_rec)
5172                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5173         } else {
5174                 /* Caller should have trapped this. */
5175                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5176                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5177                      le32_to_cpu(rec->e_cpos),
5178                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5179                 BUG();
5180         }
5181
5182         if (left_path) {
5183                 int subtree_index;
5184
5185                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5186                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5187                                            subtree_index);
5188         }
5189
5190         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5191
5192         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5193         if (ret) {
5194                 mlog_errno(ret);
5195                 goto out;
5196         }
5197
5198 out:
5199         ocfs2_free_path(left_path);
5200         return ret;
5201 }
5202
5203 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
5204                         u32 cpos, u32 len, handle_t *handle,
5205                         struct ocfs2_alloc_context *meta_ac,
5206                         struct ocfs2_cached_dealloc_ctxt *dealloc,
5207                         enum ocfs2_extent_tree_type et_type,
5208                         void *obj)
5209 {
5210         int ret, index;
5211         u32 rec_range, trunc_range;
5212         struct ocfs2_extent_rec *rec;
5213         struct ocfs2_extent_list *el;
5214         struct ocfs2_path *path = NULL;
5215         struct ocfs2_extent_tree et;
5216
5217         ocfs2_get_extent_tree(&et, inode, root_bh, et_type, obj);
5218
5219         ocfs2_extent_map_trunc(inode, 0);
5220
5221         path = ocfs2_new_path(et.et_root_bh, et.et_root_el);
5222         if (!path) {
5223                 ret = -ENOMEM;
5224                 mlog_errno(ret);
5225                 goto out;
5226         }
5227
5228         ret = ocfs2_find_path(inode, path, cpos);
5229         if (ret) {
5230                 mlog_errno(ret);
5231                 goto out;
5232         }
5233
5234         el = path_leaf_el(path);
5235         index = ocfs2_search_extent_list(el, cpos);
5236         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5237                 ocfs2_error(inode->i_sb,
5238                             "Inode %llu has an extent at cpos %u which can no "
5239                             "longer be found.\n",
5240                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5241                 ret = -EROFS;
5242                 goto out;
5243         }
5244
5245         /*
5246          * We have 3 cases of extent removal:
5247          *   1) Range covers the entire extent rec
5248          *   2) Range begins or ends on one edge of the extent rec
5249          *   3) Range is in the middle of the extent rec (no shared edges)
5250          *
5251          * For case 1 we remove the extent rec and left rotate to
5252          * fill the hole.
5253          *
5254          * For case 2 we just shrink the existing extent rec, with a
5255          * tree update if the shrinking edge is also the edge of an
5256          * extent block.
5257          *
5258          * For case 3 we do a right split to turn the extent rec into
5259          * something case 2 can handle.
5260          */
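        /*
         * For example, with rec covering clusters [100, 120):
         *   removing (100, 20) is case 1 - the whole record goes away and a
         *   left rotation fills the hole;
         *   removing (100, 8) or (112, 8) is case 2 - the record shrinks
         *   from one edge;
         *   removing (108, 4) is case 3 - a right split at 112 first turns
         *   the record into [100, 112) and [112, 120), after which the
         *   (108, 4) tail of the first piece is removed as case 2.
         */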
5261         rec = &el->l_recs[index];
5262         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5263         trunc_range = cpos + len;
5264
5265         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5266
5267         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5268              "(cpos %u, len %u)\n",
5269              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5270              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5271
5272         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5273                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5274                                          cpos, len, &et);
5275                 if (ret) {
5276                         mlog_errno(ret);
5277                         goto out;
5278                 }
5279         } else {
5280                 ret = ocfs2_split_tree(inode, &et, handle, path, index,
5281                                        trunc_range, meta_ac);
5282                 if (ret) {
5283                         mlog_errno(ret);
5284                         goto out;
5285                 }
5286
5287                 /*
5288                  * The split could have manipulated the tree enough to
5289                  * move the record location, so we have to look for it again.
5290                  */
5291                 ocfs2_reinit_path(path, 1);
5292
5293                 ret = ocfs2_find_path(inode, path, cpos);
5294                 if (ret) {
5295                         mlog_errno(ret);
5296                         goto out;
5297                 }
5298
5299                 el = path_leaf_el(path);
5300                 index = ocfs2_search_extent_list(el, cpos);
5301                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5302                         ocfs2_error(inode->i_sb,
5303                                     "Inode %llu: split at cpos %u lost record.",
5304                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5305                                     cpos);
5306                         ret = -EROFS;
5307                         goto out;
5308                 }
5309
5310                 /*
5311                  * Double check our values here. If anything is fishy,
5312                  * it's easier to catch it at the top level.
5313                  */
5314                 rec = &el->l_recs[index];
5315                 rec_range = le32_to_cpu(rec->e_cpos) +
5316                         ocfs2_rec_clusters(el, rec);
5317                 if (rec_range != trunc_range) {
5318                         ocfs2_error(inode->i_sb,
5319                                     "Inode %llu: error after split at cpos %u "
5320                                     "trunc len %u, existing record is (%u,%u)",
5321                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5322                                     cpos, len, le32_to_cpu(rec->e_cpos),
5323                                     ocfs2_rec_clusters(el, rec));
5324                         ret = -EROFS;
5325                         goto out;
5326                 }
5327
5328                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5329                                          cpos, len, &et);
5330                 if (ret) {
5331                         mlog_errno(ret);
5332                         goto out;
5333                 }
5334         }
5335
5336 out:
5337         ocfs2_free_path(path);
5338         ocfs2_put_extent_tree(&et);
5339         return ret;
5340 }
5341
5342 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5343 {
5344         struct buffer_head *tl_bh = osb->osb_tl_bh;
5345         struct ocfs2_dinode *di;
5346         struct ocfs2_truncate_log *tl;
5347
5348         di = (struct ocfs2_dinode *) tl_bh->b_data;
5349         tl = &di->id2.i_dealloc;
5350
5351         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5352                         "slot %d, invalid truncate log parameters: used = "
5353                         "%u, count = %u\n", osb->slot_num,
5354                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5355         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5356 }
5357
5358 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5359                                            unsigned int new_start)
5360 {
5361         unsigned int tail_index;
5362         unsigned int current_tail;
5363
5364         /* No records, nothing to coalesce */
5365         if (!le16_to_cpu(tl->tl_used))
5366                 return 0;
5367
5368         tail_index = le16_to_cpu(tl->tl_used) - 1;
5369         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5370         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5371
5372         return current_tail == new_start;
5373 }
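
/*
 * For example, if the current tail record starts at cluster 200 and
 * spans 50 clusters, then a new range starting at cluster 250 can be
 * coalesced with it, while one starting at 251 (or anywhere else)
 * cannot.
 */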
5374
5375 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5376                               handle_t *handle,
5377                               u64 start_blk,
5378                               unsigned int num_clusters)
5379 {
5380         int status, index;
5381         unsigned int start_cluster, tl_count;
5382         struct inode *tl_inode = osb->osb_tl_inode;
5383         struct buffer_head *tl_bh = osb->osb_tl_bh;
5384         struct ocfs2_dinode *di;
5385         struct ocfs2_truncate_log *tl;
5386
5387         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5388                    (unsigned long long)start_blk, num_clusters);
5389
5390         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5391
5392         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5393
5394         di = (struct ocfs2_dinode *) tl_bh->b_data;
5395         tl = &di->id2.i_dealloc;
5396         if (!OCFS2_IS_VALID_DINODE(di)) {
5397                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5398                 status = -EIO;
5399                 goto bail;
5400         }
5401
5402         tl_count = le16_to_cpu(tl->tl_count);
5403         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5404                         tl_count == 0,
5405                         "Truncate record count on #%llu invalid: "
5406                         "wanted %u, actual %u\n",
5407                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5408                         ocfs2_truncate_recs_per_inode(osb->sb),
5409                         le16_to_cpu(tl->tl_count));
5410
5411         /* Caller should have known to flush before calling us. */
5412         index = le16_to_cpu(tl->tl_used);
5413         if (index >= tl_count) {
5414                 status = -ENOSPC;
5415                 mlog_errno(status);
5416                 goto bail;
5417         }
5418
5419         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5420                                       OCFS2_JOURNAL_ACCESS_WRITE);
5421         if (status < 0) {
5422                 mlog_errno(status);
5423                 goto bail;
5424         }
5425
5426         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5427              "%llu (index = %d)\n", num_clusters, start_cluster,
5428              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5429
5430         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5431                 /*
5432                  * Move index back to the record we are coalescing with.
5433                  * ocfs2_truncate_log_can_coalesce() guarantees a nonzero tl_used.
5434                  */
5435                 index--;
5436
5437                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5438                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5439                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5440                      num_clusters);
5441         } else {
5442                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5443                 tl->tl_used = cpu_to_le16(index + 1);
5444         }
5445         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5446
5447         status = ocfs2_journal_dirty(handle, tl_bh);
5448         if (status < 0) {
5449                 mlog_errno(status);
5450                 goto bail;
5451         }
5452
5453 bail:
5454         mlog_exit(status);
5455         return status;
5456 }
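
/*
 * Continuing the example above: appending (start cluster 250,
 * 30 clusters) to a log whose tail record is (200, 50) coalesces into
 * a single (200, 80) record and tl_used is unchanged, whereas
 * appending (300, 10) would consume a new slot and bump tl_used.
 */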
5457
5458 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5459                                          handle_t *handle,
5460                                          struct inode *data_alloc_inode,
5461                                          struct buffer_head *data_alloc_bh)
5462 {
5463         int status = 0;
5464         int i;
5465         unsigned int num_clusters;
5466         u64 start_blk;
5467         struct ocfs2_truncate_rec rec;
5468         struct ocfs2_dinode *di;
5469         struct ocfs2_truncate_log *tl;
5470         struct inode *tl_inode = osb->osb_tl_inode;
5471         struct buffer_head *tl_bh = osb->osb_tl_bh;
5472
5473         mlog_entry_void();
5474
5475         di = (struct ocfs2_dinode *) tl_bh->b_data;
5476         tl = &di->id2.i_dealloc;
5477         i = le16_to_cpu(tl->tl_used) - 1;
5478         while (i >= 0) {
5479                 /* Caller has given us at least enough credits to
5480                  * update the truncate log dinode */
5481                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5482                                               OCFS2_JOURNAL_ACCESS_WRITE);
5483                 if (status < 0) {
5484                         mlog_errno(status);
5485                         goto bail;
5486                 }
5487
5488                 tl->tl_used = cpu_to_le16(i);
5489
5490                 status = ocfs2_journal_dirty(handle, tl_bh);
5491                 if (status < 0) {
5492                         mlog_errno(status);
5493                         goto bail;
5494                 }
5495
5496                 /* TODO: Perhaps we can calculate the bulk of the
5497                  * credits up front rather than extending like
5498                  * this. */
5499                 status = ocfs2_extend_trans(handle,
5500                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5501                 if (status < 0) {
5502                         mlog_errno(status);
5503                         goto bail;
5504                 }
5505
5506                 rec = tl->tl_recs[i];
5507                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5508                                                     le32_to_cpu(rec.t_start));
5509                 num_clusters = le32_to_cpu(rec.t_clusters);
5510
5511                 /* if start_blk is not set, we ignore the record as
5512                  * invalid. */
5513                 if (start_blk) {
5514                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5515                              i, le32_to_cpu(rec.t_start), num_clusters);
5516
5517                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5518                                                      data_alloc_bh, start_blk,
5519                                                      num_clusters);
5520                         if (status < 0) {
5521                                 mlog_errno(status);
5522                                 goto bail;
5523                         }
5524                 }
5525                 i--;
5526         }
5527
5528 bail:
5529         mlog_exit(status);
5530         return status;
5531 }
5532
5533 /* Expects you to already be holding tl_inode->i_mutex */
5534 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5535 {
5536         int status;
5537         unsigned int num_to_flush;
5538         handle_t *handle;
5539         struct inode *tl_inode = osb->osb_tl_inode;
5540         struct inode *data_alloc_inode = NULL;
5541         struct buffer_head *tl_bh = osb->osb_tl_bh;
5542         struct buffer_head *data_alloc_bh = NULL;
5543         struct ocfs2_dinode *di;
5544         struct ocfs2_truncate_log *tl;
5545
5546         mlog_entry_void();
5547
5548         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5549
5550         di = (struct ocfs2_dinode *) tl_bh->b_data;
5551         tl = &di->id2.i_dealloc;
5552         if (!OCFS2_IS_VALID_DINODE(di)) {
5553                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5554                 status = -EIO;
5555                 goto out;
5556         }
5557
5558         num_to_flush = le16_to_cpu(tl->tl_used);
5559         mlog(0, "Flush %u records from truncate log #%llu\n",
5560              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5561         if (!num_to_flush) {
5562                 status = 0;
5563                 goto out;
5564         }
5565
5566         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5567                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5568                                                        OCFS2_INVALID_SLOT);
5569         if (!data_alloc_inode) {
5570                 status = -EINVAL;
5571                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5572                 goto out;
5573         }
5574
5575         mutex_lock(&data_alloc_inode->i_mutex);
5576
5577         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5578         if (status < 0) {
5579                 mlog_errno(status);
5580                 goto out_mutex;
5581         }
5582
5583         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5584         if (IS_ERR(handle)) {
5585                 status = PTR_ERR(handle);
5586                 mlog_errno(status);
5587                 goto out_unlock;
5588         }
5589
5590         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5591                                                data_alloc_bh);
5592         if (status < 0)
5593                 mlog_errno(status);
5594
5595         ocfs2_commit_trans(osb, handle);
5596
5597 out_unlock:
5598         brelse(data_alloc_bh);
5599         ocfs2_inode_unlock(data_alloc_inode, 1);
5600
5601 out_mutex:
5602         mutex_unlock(&data_alloc_inode->i_mutex);
5603         iput(data_alloc_inode);
5604
5605 out:
5606         mlog_exit(status);
5607         return status;
5608 }
5609
5610 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5611 {
5612         int status;
5613         struct inode *tl_inode = osb->osb_tl_inode;
5614
5615         mutex_lock(&tl_inode->i_mutex);
5616         status = __ocfs2_flush_truncate_log(osb);
5617         mutex_unlock(&tl_inode->i_mutex);
5618
5619         return status;
5620 }
5621
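/*
 * Illustrative sketch, not a verbatim caller: code that already holds
 * tl_inode->i_mutex (for instance because it appends records under the
 * same critical section) must use the double-underscore variant:
 *
 *	mutex_lock(&osb->osb_tl_inode->i_mutex);
 *	if (ocfs2_truncate_log_needs_flush(osb))
 *		status = __ocfs2_flush_truncate_log(osb);
 *	mutex_unlock(&osb->osb_tl_inode->i_mutex);
 *
 * Everything else should go through ocfs2_flush_truncate_log(), which
 * takes and drops the mutex itself.
 */
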
5622 static void ocfs2_truncate_log_worker(struct work_struct *work)
5623 {
5624         int status;
5625         struct ocfs2_super *osb =
5626                 container_of(work, struct ocfs2_super,
5627                              osb_truncate_log_wq.work);
5628
5629         mlog_entry_void();
5630
5631         status = ocfs2_flush_truncate_log(osb);
5632         if (status < 0)
5633                 mlog_errno(status);
5634         else
5635                 ocfs2_init_inode_steal_slot(osb);
5636
5637         mlog_exit(status);
5638 }
5639
5640 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5641 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5642                                        int cancel)
5643 {
5644         if (osb->osb_tl_inode) {
5645                 /* We want to push off log flushes while truncates are
5646                  * still running. */
5647                 if (cancel)
5648                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5649
5650                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5651                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5652         }
5653 }
5654
5655 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5656                                        int slot_num,
5657                                        struct inode **tl_inode,
5658                                        struct buffer_head **tl_bh)
5659 {
5660         int status;
5661         struct inode *inode = NULL;
5662         struct buffer_head *bh = NULL;
5663
5664         inode = ocfs2_get_system_file_inode(osb,
5665                                            TRUNCATE_LOG_SYSTEM_INODE,
5666                                            slot_num);
5667         if (!inode) {
5668                 status = -EINVAL;
5669                 mlog(ML_ERROR, "Could not get truncate log inode!\n");
5670                 goto bail;
5671         }
5672
5673         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5674                                   OCFS2_BH_CACHED, inode);
5675         if (status < 0) {
5676                 iput(inode);
5677                 mlog_errno(status);
5678                 goto bail;
5679         }
5680
5681         *tl_inode = inode;
5682         *tl_bh    = bh;
5683 bail:
5684         mlog_exit(status);
5685         return status;
5686 }
5687
5688 /* called during the 1st stage of node recovery. we stamp a clean
5689  * truncate log and pass back a copy for processing later. if the
5690  * truncate log does not require processing, *tl_copy is set to
5691  * NULL. */
5692 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5693                                       int slot_num,
5694                                       struct ocfs2_dinode **tl_copy)
5695 {
5696         int status;
5697         struct inode *tl_inode = NULL;
5698         struct buffer_head *tl_bh = NULL;
5699         struct ocfs2_dinode *di;
5700         struct ocfs2_truncate_log *tl;
5701
5702         *tl_copy = NULL;
5703
5704         mlog(0, "recover truncate log from slot %d\n", slot_num);
5705
5706         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5707         if (status < 0) {
5708                 mlog_errno(status);
5709                 goto bail;
5710         }
5711
5712         di = (struct ocfs2_dinode *) tl_bh->b_data;
5713         tl = &di->id2.i_dealloc;
5714         if (!OCFS2_IS_VALID_DINODE(di)) {
5715                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5716                 status = -EIO;
5717                 goto bail;
5718         }
5719
5720         if (le16_to_cpu(tl->tl_used)) {
5721                 mlog(0, "We'll have %u logs to recover\n",
5722                      le16_to_cpu(tl->tl_used));
5723
5724                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5725                 if (!(*tl_copy)) {
5726                         status = -ENOMEM;
5727                         mlog_errno(status);
5728                         goto bail;
5729                 }
5730
5731                 /* Assuming the write-out below goes well, this copy
5732                  * will be passed back to recovery for processing. */
5733                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5734
5735                 /* All we need to do to clear the truncate log is set
5736                  * tl_used. */
5737                 tl->tl_used = 0;
5738
5739                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5740                 if (status < 0) {
5741                         mlog_errno(status);
5742                         goto bail;
5743                 }
5744         }
5745
5746 bail:
5747         if (tl_inode)
5748                 iput(tl_inode);
5749         if (tl_bh)
5750                 brelse(tl_bh);
5751
5752         if (status < 0 && (*tl_copy)) {
5753                 kfree(*tl_copy);
5754                 *tl_copy = NULL;
5755         }
5756
5757         mlog_exit(status);
5758         return status;
5759 }
5760
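/*
 * Putting the two stages together, recovery of another node's truncate
 * log looks roughly like the hedged sketch below (the real caller lives
 * in the journal recovery code and handles more error cases):
 *
 *	struct ocfs2_dinode *tl_copy = NULL;
 *
 *	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
 *	if (status == 0 && tl_copy != NULL) {
 *		status = ocfs2_complete_truncate_log_recovery(osb, tl_copy);
 *		kfree(tl_copy);
 *	}
 */
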
5761 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5762                                          struct ocfs2_dinode *tl_copy)
5763 {
5764         int status = 0;
5765         int i;
5766         unsigned int clusters, num_recs, start_cluster;
5767         u64 start_blk;
5768         handle_t *handle;
5769         struct inode *tl_inode = osb->osb_tl_inode;
5770         struct ocfs2_truncate_log *tl;
5771
5772         mlog_entry_void();
5773
5774         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5775                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5776                 return -EINVAL;
5777         }
5778
5779         tl = &tl_copy->id2.i_dealloc;
5780         num_recs = le16_to_cpu(tl->tl_used);
5781         mlog(0, "cleanup %u records from %llu\n", num_recs,
5782              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5783
5784         mutex_lock(&tl_inode->i_mutex);
5785         for(i = 0; i < num_recs; i++) {
5786                 if (ocfs2_truncate_log_needs_flush(osb)) {
5787                         status = __ocfs2_flush_truncate_log(osb);
5788                         if (status < 0) {
5789                                 mlog_errno(status);
5790                                 goto bail_up;
5791                         }
5792                 }
5793
5794                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5795                 if (IS_ERR(handle)) {
5796                         status = PTR_ERR(handle);
5797                         mlog_errno(status);
5798                         goto bail_up;
5799                 }
5800
5801                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5802                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5803                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5804
5805                 status = ocfs2_truncate_log_append(osb, handle,
5806                                                    start_blk, clusters);
5807                 ocfs2_commit_trans(osb, handle);
5808                 if (status < 0) {
5809                         mlog_errno(status);
5810                         goto bail_up;
5811                 }
5812         }
5813
5814 bail_up:
5815         mutex_unlock(&tl_inode->i_mutex);
5816
5817         mlog_exit(status);
5818         return status;
5819 }
5820
5821 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5822 {
5823         int status;
5824         struct inode *tl_inode = osb->osb_tl_inode;
5825
5826         mlog_entry_void();
5827
5828         if (tl_inode) {
5829                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5830                 flush_workqueue(ocfs2_wq);
5831
5832                 status = ocfs2_flush_truncate_log(osb);
5833                 if (status < 0)
5834                         mlog_errno(status);
5835
5836                 brelse(osb->osb_tl_bh);
5837                 iput(osb->osb_tl_inode);
5838         }
5839
5840         mlog_exit_void();
5841 }
5842
5843 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5844 {
5845         int status;
5846         struct inode *tl_inode = NULL;
5847         struct buffer_head *tl_bh = NULL;
5848
5849         mlog_entry_void();
5850
5851         status = ocfs2_get_truncate_log_info(osb,
5852                                              osb->slot_num,
5853                                              &tl_inode,
5854                                              &tl_bh);
5855         if (status < 0)
5856                 mlog_errno(status);
5857
5858         /* ocfs2_truncate_log_shutdown keys on the existence of
5859          * osb->osb_tl_inode so we don't set any of the osb variables
5860          * until we're sure all is well. */
5861         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5862                           ocfs2_truncate_log_worker);
5863         osb->osb_tl_bh    = tl_bh;
5864         osb->osb_tl_inode = tl_inode;
5865
5866         mlog_exit(status);
5867         return status;
5868 }
5869
5870 /*
5871  * Delayed de-allocation of suballocator blocks.
5872  *
5873  * Some sets of block de-allocations might involve multiple suballocator inodes.
5874  *
5875  * The locking for this can get extremely complicated, especially when
5876  * the suballocator inodes to delete from aren't known until deep
5877  * within an unrelated codepath.
5878  *
5879  * ocfs2_extent_block structures are a good example of this - an inode
5880  * btree could have been grown by any number of nodes each allocating
5881  * out of their own suballoc inode.
5882  *
5883  * These structures allow the delay of block de-allocation until a
5884  * later time, when locking of multiple cluster inodes won't cause
5885  * deadlock.
5886  */
5887
5888 /*
5889  * Describes a single block free from a suballocator
5890  */
5891 struct ocfs2_cached_block_free {
5892         struct ocfs2_cached_block_free          *free_next;
5893         u64                                     free_blk;
5894         unsigned int                            free_bit;
5895 };
5896
5897 struct ocfs2_per_slot_free_list {
5898         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5899         int                                     f_inode_type;
5900         int                                     f_slot;
5901         struct ocfs2_cached_block_free          *f_first;
5902 };
5903
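/*
 * Rough usage sketch (illustration only, assuming the
 * ocfs2_init_dealloc_ctxt() helper declared in alloc.h): a truncate or
 * rotate path caches extent block frees while it still holds its own
 * cluster locks, then runs the deallocations once those locks are safe
 * to drop:
 *
 *	struct ocfs2_cached_dealloc_ctxt dealloc;
 *
 *	ocfs2_init_dealloc_ctxt(&dealloc);
 *	...
 *	ret = ocfs2_cache_extent_block_free(&dealloc, eb);
 *	...
 *	ocfs2_run_deallocs(osb, &dealloc);
 */
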
5904 static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5905                                    int sysfile_type,
5906                                    int slot,
5907                                    struct ocfs2_cached_block_free *head)
5908 {
5909         int ret;
5910         u64 bg_blkno;
5911         handle_t *handle;
5912         struct inode *inode;
5913         struct buffer_head *di_bh = NULL;
5914         struct ocfs2_cached_block_free *tmp;
5915
5916         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5917         if (!inode) {
5918                 ret = -EINVAL;
5919                 mlog_errno(ret);
5920                 goto out;
5921         }
5922
5923         mutex_lock(&inode->i_mutex);
5924
5925         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5926         if (ret) {
5927                 mlog_errno(ret);
5928                 goto out_mutex;
5929         }
5930
5931         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5932         if (IS_ERR(handle)) {
5933                 ret = PTR_ERR(handle);
5934                 mlog_errno(ret);
5935                 goto out_unlock;
5936         }
5937
5938         while (head) {
5939                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5940                                                       head->free_bit);
5941                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5942                      head->free_bit, (unsigned long long)head->free_blk);
5943
5944                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5945                                                head->free_bit, bg_blkno, 1);
5946                 if (ret) {
5947                         mlog_errno(ret);
5948                         goto out_journal;
5949                 }
5950
5951                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5952                 if (ret) {
5953                         mlog_errno(ret);
5954                         goto out_journal;
5955                 }
5956
5957                 tmp = head;
5958                 head = head->free_next;
5959                 kfree(tmp);
5960         }
5961
5962 out_journal:
5963         ocfs2_commit_trans(osb, handle);
5964
5965 out_unlock:
5966         ocfs2_inode_unlock(inode, 1);
5967         brelse(di_bh);
5968 out_mutex:
5969         mutex_unlock(&inode->i_mutex);
5970         iput(inode);
5971 out:
5972         while(head) {
5973                 /* Premature exit may have left some dangling items. */
5974                 tmp = head;
5975                 head = head->free_next;
5976                 kfree(tmp);
5977         }
5978
5979         return ret;
5980 }
5981
5982 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5983                        struct ocfs2_cached_dealloc_ctxt *ctxt)
5984 {
5985         int ret = 0, ret2;
5986         struct ocfs2_per_slot_free_list *fl;
5987
5988         if (!ctxt)
5989                 return 0;
5990
5991         while (ctxt->c_first_suballocator) {
5992                 fl = ctxt->c_first_suballocator;
5993
5994                 if (fl->f_first) {
5995                         mlog(0, "Free items: (type %u, slot %d)\n",
5996                              fl->f_inode_type, fl->f_slot);
5997                         ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5998                                                        fl->f_slot, fl->f_first);
5999                         if (ret2)
6000                                 mlog_errno(ret2);
6001                         if (!ret)
6002                                 ret = ret2;
6003                 }
6004
6005                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6006                 kfree(fl);
6007         }
6008
6009         return ret;
6010 }
6011
6012 static struct ocfs2_per_slot_free_list *
6013 ocfs2_find_per_slot_free_list(int type,
6014                               int slot,
6015                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6016 {
6017         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6018
6019         while (fl) {
6020                 if (fl->f_inode_type == type && fl->f_slot == slot)
6021                         return fl;
6022
6023                 fl = fl->f_next_suballocator;
6024         }
6025
6026         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6027         if (fl) {
6028                 fl->f_inode_type = type;
6029                 fl->f_slot = slot;
6030                 fl->f_first = NULL;
6031                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6032
6033                 ctxt->c_first_suballocator = fl;
6034         }
6035         return fl;
6036 }
6037
6038 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6039                                      int type, int slot, u64 blkno,
6040                                      unsigned int bit)
6041 {
6042         int ret;
6043         struct ocfs2_per_slot_free_list *fl;
6044         struct ocfs2_cached_block_free *item;
6045
6046         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6047         if (fl == NULL) {
6048                 ret = -ENOMEM;
6049                 mlog_errno(ret);
6050                 goto out;
6051         }
6052
6053         item = kmalloc(sizeof(*item), GFP_NOFS);
6054         if (item == NULL) {
6055                 ret = -ENOMEM;
6056                 mlog_errno(ret);
6057                 goto out;
6058         }
6059
6060         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6061              type, slot, bit, (unsigned long long)blkno);
6062
6063         item->free_blk = blkno;
6064         item->free_bit = bit;
6065         item->free_next = fl->f_first;
6066
6067         fl->f_first = item;
6068
6069         ret = 0;
6070 out:
6071         return ret;
6072 }
6073
6074 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6075                                          struct ocfs2_extent_block *eb)
6076 {
6077         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6078                                          le16_to_cpu(eb->h_suballoc_slot),
6079                                          le64_to_cpu(eb->h_blkno),
6080                                          le16_to_cpu(eb->h_suballoc_bit));
6081 }
6082
6083 /* This function will figure out whether the currently last extent
6084  * block will be deleted, and if it will, what the new last extent
6085  * block will be, so we can update its h_next_leaf_blk field, as well
6086  * as the dinode's i_last_eb_blk. */
6087 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6088                                        unsigned int clusters_to_del,
6089                                        struct ocfs2_path *path,
6090                                        struct buffer_head **new_last_eb)
6091 {
6092         int next_free, ret = 0;
6093         u32 cpos;
6094         struct ocfs2_extent_rec *rec;
6095         struct ocfs2_extent_block *eb;
6096         struct ocfs2_extent_list *el;
6097         struct buffer_head *bh = NULL;
6098
6099         *new_last_eb = NULL;
6100
6101         /* we have no tree, so of course, no last_eb. */
6102         if (!path->p_tree_depth)
6103                 goto out;
6104
6105         /* trunc to zero special case - this makes tree_depth = 0
6106          * regardless of what it is.  */
6107         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6108                 goto out;
6109
6110         el = path_leaf_el(path);
6111         BUG_ON(!el->l_next_free_rec);
6112
6113         /*
6114          * Make sure that this extent list will actually be empty
6115          * after we clear away the data. We can shortcut out if
6116          * there's more than one non-empty extent in the
6117          * list. Otherwise, a check of the remaining extent is
6118          * necessary.
6119          */
6120         next_free = le16_to_cpu(el->l_next_free_rec);
6121         rec = NULL;
6122         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6123                 if (next_free > 2)
6124                         goto out;
6125
6126                 /* We may have a valid extent in index 1, check it. */
6127                 if (next_free == 2)
6128                         rec = &el->l_recs[1];
6129
6130                 /*
6131                  * Fall through - no more nonempty extents, so we want
6132                  * to delete this leaf.
6133                  */
6134         } else {
6135                 if (next_free > 1)
6136                         goto out;
6137
6138                 rec = &el->l_recs[0];
6139         }
6140
6141         if (rec) {
6142                 /*
6143                  * Check whether we'll only be trimming off the end of
6144                  * this extent.
6145                  */
6146                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6147                         goto out;
6148         }
6149
6150         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6151         if (ret) {
6152                 mlog_errno(ret);
6153                 goto out;
6154         }
6155
6156         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6157         if (ret) {
6158                 mlog_errno(ret);
6159                 goto out;
6160         }
6161
6162         eb = (struct ocfs2_extent_block *) bh->b_data;
6163         el = &eb->h_list;
6164         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6165                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6166                 ret = -EROFS;
6167                 goto out;
6168         }
6169
6170         *new_last_eb = bh;
6171         get_bh(*new_last_eb);
6172         mlog(0, "returning block %llu, (cpos: %u)\n",
6173              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6174 out:
6175         brelse(bh);
6176
6177         return ret;
6178 }
6179
6180 /*
6181  * Trim some clusters off the rightmost edge of a tree. Only called
6182  * during truncate.
6183  *
6184  * The caller needs to:
6185  *   - start journaling of each path component.
6186  *   - compute and fully set up any new last ext block
6187  */
6188 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6189                            handle_t *handle, struct ocfs2_truncate_context *tc,
6190                            u32 clusters_to_del, u64 *delete_start)
6191 {
6192         int ret, i, index = path->p_tree_depth;
6193         u32 new_edge = 0;
6194         u64 deleted_eb = 0;
6195         struct buffer_head *bh;
6196         struct ocfs2_extent_list *el;
6197         struct ocfs2_extent_rec *rec;
6198
6199         *delete_start = 0;
6200
6201         while (index >= 0) {
6202                 bh = path->p_node[index].bh;
6203                 el = path->p_node[index].el;
6204
6205                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6206                      index,  (unsigned long long)bh->b_blocknr);
6207
6208                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6209
6210                 if (index !=
6211                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6212                         ocfs2_error(inode->i_sb,
6213                                     "Inode %lu has invalid ext. block %llu",
6214                                     inode->i_ino,
6215                                     (unsigned long long)bh->b_blocknr);
6216                         ret = -EROFS;
6217                         goto out;
6218                 }
6219
6220 find_tail_record:
6221                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6222                 rec = &el->l_recs[i];
6223
6224                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6225                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6226                      ocfs2_rec_clusters(el, rec),
6227                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6228                      le16_to_cpu(el->l_next_free_rec));
6229
6230                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6231
6232                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6233                         /*
6234                          * If the leaf block contains nothing but a
6235                          * single empty extent record, we can just
6236                          * remove the block.
6237                          */
6238                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6239                                 memset(rec, 0,
6240                                        sizeof(struct ocfs2_extent_rec));
6241                                 el->l_next_free_rec = cpu_to_le16(0);
6242
6243                                 goto delete;
6244                         }
6245
6246                         /*
6247                          * Remove any empty extents by shifting things
6248                          * left. That should make life much easier on
6249                          * the code below. This condition is rare
6250                          * enough that we shouldn't see a performance
6251                          * hit.
6252                          */
6253                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6254                                 le16_add_cpu(&el->l_next_free_rec, -1);
6255
6256                                 for(i = 0;
6257                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6258                                         el->l_recs[i] = el->l_recs[i + 1];
6259
6260                                 memset(&el->l_recs[i], 0,
6261                                        sizeof(struct ocfs2_extent_rec));
6262
6263                                 /*
6264                                  * We've modified our extent list. The
6265                                  * simplest way to handle this change
6266                                  * is to begin the search from the
6267                                  * start again.
6268                                  */
6269                                 goto find_tail_record;
6270                         }
6271
6272                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6273
6274                         /*
6275                          * We'll use "new_edge" on our way back up the
6276                          * tree to know what our rightmost cpos is.
6277                          */
6278                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6279                         new_edge += le32_to_cpu(rec->e_cpos);
6280
6281                         /*
6282                          * The caller will use this to delete data blocks.
6283                          */
6284                         *delete_start = le64_to_cpu(rec->e_blkno)
6285                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6286                                         le16_to_cpu(rec->e_leaf_clusters));
6287
6288                         /*
6289                          * If it's now empty, remove this record.
6290                          */
6291                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6292                                 memset(rec, 0,
6293                                        sizeof(struct ocfs2_extent_rec));
6294                                 le16_add_cpu(&el->l_next_free_rec, -1);
6295                         }
6296                 } else {
6297                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6298                                 memset(rec, 0,
6299                                        sizeof(struct ocfs2_extent_rec));
6300                                 le16_add_cpu(&el->l_next_free_rec, -1);
6301
6302                                 goto delete;
6303                         }
6304
6305                         /* Can this actually happen? */
6306                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6307                                 goto delete;
6308
6309                         /*
6310                          * We never actually deleted any clusters
6311                          * because our leaf was empty. There's no
6312                          * reason to adjust the rightmost edge then.
6313                          */
6314                         if (new_edge == 0)
6315                                 goto delete;
6316
6317                         rec->e_int_clusters = cpu_to_le32(new_edge);
6318                         le32_add_cpu(&rec->e_int_clusters,
6319                                      -le32_to_cpu(rec->e_cpos));
6320
6321                          /*
6322                           * A deleted child record should have been
6323                           * caught above.
6324                           */
6325                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6326                 }
6327
6328 delete:
6329                 ret = ocfs2_journal_dirty(handle, bh);
6330                 if (ret) {
6331                         mlog_errno(ret);
6332                         goto out;
6333                 }
6334
6335                 mlog(0, "extent list container %llu, after: record %d: "
6336                      "(%u, %u, %llu), next = %u.\n",
6337                      (unsigned long long)bh->b_blocknr, i,
6338                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6339                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6340                      le16_to_cpu(el->l_next_free_rec));
6341
6342                 /*
6343                  * We must be careful to only attempt delete of an
6344                  * extent block (and not the root inode block).
6345                  */
6346                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6347                         struct ocfs2_extent_block *eb =
6348                                 (struct ocfs2_extent_block *)bh->b_data;
6349
6350                         /*
6351                          * Save this for use when processing the
6352                          * parent block.
6353                          */
6354                         deleted_eb = le64_to_cpu(eb->h_blkno);
6355
6356                         mlog(0, "deleting this extent block.\n");
6357
6358                         ocfs2_remove_from_cache(inode, bh);
6359
6360                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6361                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6362                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6363
6364                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6365                         /* An error here is not fatal. */
6366                         if (ret < 0)
6367                                 mlog_errno(ret);
6368                 } else {
6369                         deleted_eb = 0;
6370                 }
6371
6372                 index--;
6373         }
6374
6375         ret = 0;
6376 out:
6377         return ret;
6378 }
6379
6380 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6381                              unsigned int clusters_to_del,
6382                              struct inode *inode,
6383                              struct buffer_head *fe_bh,
6384                              handle_t *handle,
6385                              struct ocfs2_truncate_context *tc,
6386                              struct ocfs2_path *path)
6387 {
6388         int status;
6389         struct ocfs2_dinode *fe;
6390         struct ocfs2_extent_block *last_eb = NULL;
6391         struct ocfs2_extent_list *el;
6392         struct buffer_head *last_eb_bh = NULL;
6393         u64 delete_blk = 0;
6394
6395         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6396
6397         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6398                                              path, &last_eb_bh);
6399         if (status < 0) {
6400                 mlog_errno(status);
6401                 goto bail;
6402         }
6403
6404         /*
6405          * Each component will be touched, so we might as well journal
6406          * here to avoid having to handle errors later.
6407          */
6408         status = ocfs2_journal_access_path(inode, handle, path);
6409         if (status < 0) {
6410                 mlog_errno(status);
6411                 goto bail;
6412         }
6413
6414         if (last_eb_bh) {
6415                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6416                                               OCFS2_JOURNAL_ACCESS_WRITE);
6417                 if (status < 0) {
6418                         mlog_errno(status);
6419                         goto bail;
6420                 }
6421
6422                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6423         }
6424
6425         el = &(fe->id2.i_list);
6426
6427         /*
6428          * Lower levels depend on this never happening, but it's best
6429          * to check it up here before changing the tree.
6430          */
6431         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6432                 ocfs2_error(inode->i_sb,
6433                             "Inode %lu has an empty extent record, depth %u\n",
6434                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6435                 status = -EROFS;
6436                 goto bail;
6437         }
6438
6439         spin_lock(&OCFS2_I(inode)->ip_lock);
6440         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6441                                       clusters_to_del;
6442         spin_unlock(&OCFS2_I(inode)->ip_lock);
6443         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6444         inode->i_blocks = ocfs2_inode_sector_count(inode);
6445
6446         status = ocfs2_trim_tree(inode, path, handle, tc,
6447                                  clusters_to_del, &delete_blk);
6448         if (status) {
6449                 mlog_errno(status);
6450                 goto bail;
6451         }
6452
6453         if (le32_to_cpu(fe->i_clusters) == 0) {
6454                 /* trunc to zero is a special case. */
6455                 el->l_tree_depth = 0;
6456                 fe->i_last_eb_blk = 0;
6457         } else if (last_eb)
6458                 fe->i_last_eb_blk = last_eb->h_blkno;
6459
6460         status = ocfs2_journal_dirty(handle, fe_bh);
6461         if (status < 0) {
6462                 mlog_errno(status);
6463                 goto bail;
6464         }
6465
6466         if (last_eb) {
6467                 /* If there will be a new last extent block, then by
6468                  * definition, there cannot be any leaves to the right of
6469                  * it. */
6470                 last_eb->h_next_leaf_blk = 0;
6471                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6472                 if (status < 0) {
6473                         mlog_errno(status);
6474                         goto bail;
6475                 }
6476         }
6477
6478         if (delete_blk) {
6479                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6480                                                    clusters_to_del);
6481                 if (status < 0) {
6482                         mlog_errno(status);
6483                         goto bail;
6484                 }
6485         }
6486         status = 0;
6487 bail:
6488
6489         mlog_exit(status);
6490         return status;
6491 }
6492
6493 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
6494 {
6495         set_buffer_uptodate(bh);
6496         mark_buffer_dirty(bh);
6497         return 0;
6498 }
6499
6500 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6501 {
6502         set_buffer_uptodate(bh);
6503         mark_buffer_dirty(bh);
6504         return ocfs2_journal_dirty_data(handle, bh);
6505 }
6506
6507 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6508                                      unsigned int from, unsigned int to,
6509                                      struct page *page, int zero, u64 *phys)
6510 {
6511         int ret, partial = 0;
6512
6513         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6514         if (ret)
6515                 mlog_errno(ret);
6516
6517         if (zero)
6518                 zero_user_segment(page, from, to);
6519
6520         /*
6521          * The buffers we zeroed need to be marked uptodate
6522          * here if they aren't already - ocfs2_map_page_blocks()
6523          * may have skipped some.
6524          */
6525         if (ocfs2_should_order_data(inode)) {
6526                 ret = walk_page_buffers(handle,
6527                                         page_buffers(page),
6528                                         from, to, &partial,
6529                                         ocfs2_ordered_zero_func);
6530                 if (ret < 0)
6531                         mlog_errno(ret);
6532         } else {
6533                 ret = walk_page_buffers(handle, page_buffers(page),
6534                                         from, to, &partial,
6535                                         ocfs2_writeback_zero_func);
6536                 if (ret < 0)
6537                         mlog_errno(ret);
6538         }
6539
6540         if (!partial)
6541                 SetPageUptodate(page);
6542
6543         flush_dcache_page(page);
6544 }
6545
6546 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6547                                      loff_t end, struct page **pages,
6548                                      int numpages, u64 phys, handle_t *handle)
6549 {
6550         int i;
6551         struct page *page;
6552         unsigned int from, to = PAGE_CACHE_SIZE;
6553         struct super_block *sb = inode->i_sb;
6554
6555         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6556
6557         if (numpages == 0)
6558                 goto out;
6559
6560         to = PAGE_CACHE_SIZE;
6561         for(i = 0; i < numpages; i++) {
6562                 page = pages[i];
6563
6564                 from = start & (PAGE_CACHE_SIZE - 1);
6565                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6566                         to = end & (PAGE_CACHE_SIZE - 1);
6567
6568                 BUG_ON(from > PAGE_CACHE_SIZE);
6569                 BUG_ON(to > PAGE_CACHE_SIZE);
6570
6571                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6572                                          &phys);
6573
6574                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6575         }
6576 out:
6577         if (pages)
6578                 ocfs2_unlock_and_free_pages(pages, numpages);
6579 }
6580
6581 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6582                                 struct page **pages, int *num)
6583 {
6584         int numpages, ret = 0;
6585         struct super_block *sb = inode->i_sb;
6586         struct address_space *mapping = inode->i_mapping;
6587         unsigned long index;
6588         loff_t last_page_bytes;
6589
6590         BUG_ON(start > end);
6591
6592         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6593                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6594
6595         numpages = 0;
6596         last_page_bytes = PAGE_ALIGN(end);
6597         index = start >> PAGE_CACHE_SHIFT;
6598         do {
6599                 pages[numpages] = grab_cache_page(mapping, index);
6600                 if (!pages[numpages]) {
6601                         ret = -ENOMEM;
6602                         mlog_errno(ret);
6603                         goto out;
6604                 }
6605
6606                 numpages++;
6607                 index++;
6608         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6609
6610 out:
6611         if (ret != 0) {
6612                 if (pages)
6613                         ocfs2_unlock_and_free_pages(pages, numpages);
6614                 numpages = 0;
6615         }
6616
6617         *num = numpages;
6618
6619         return ret;
6620 }
6621
6622 /*
6623  * Zero the area past i_size but still within an allocated
6624  * cluster. This avoids exposing nonzero data on subsequent file
6625  * extends.
6626  *
6627  * We need to call this before i_size is updated on the inode because
6628  * otherwise block_write_full_page() will skip writeout of pages past
6629  * i_size. The explicit range parameters are passed for this reason.
6630  */
6631 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6632                                   u64 range_start, u64 range_end)
6633 {
6634         int ret = 0, numpages;
6635         struct page **pages = NULL;
6636         u64 phys;
6637         unsigned int ext_flags;
6638         struct super_block *sb = inode->i_sb;
6639
6640         /*
6641          * File systems which don't support sparse files zero on every
6642          * extend.
6643          */
6644         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6645                 return 0;
6646
6647         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6648                         sizeof(struct page *), GFP_NOFS);
6649         if (pages == NULL) {
6650                 ret = -ENOMEM;
6651                 mlog_errno(ret);
6652                 goto out;
6653         }
6654
6655         if (range_start == range_end)
6656                 goto out;
6657
6658         ret = ocfs2_extent_map_get_blocks(inode,
6659                                           range_start >> sb->s_blocksize_bits,
6660                                           &phys, NULL, &ext_flags);
6661         if (ret) {
6662                 mlog_errno(ret);
6663                 goto out;
6664         }
6665
6666         /*
6667          * Tail is a hole, or is marked unwritten. In either case, we
6668          * can count on read and write to return/push zeros.
6669          */
6670         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6671                 goto out;
6672
6673         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6674                                    &numpages);
6675         if (ret) {
6676                 mlog_errno(ret);
6677                 goto out;
6678         }
6679
6680         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6681                                  numpages, phys, handle);
6682
6683         /*
6684          * Initiate writeout of the pages we zero'd here. We don't
6685          * wait on them - the truncate_inode_pages() call later will
6686          * do that for us.
6687          */
6688         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6689                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6690         if (ret)
6691                 mlog_errno(ret);
6692
6693 out:
6694         if (pages)
6695                 kfree(pages);
6696
6697         return ret;
6698 }
6699
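/*
 * A hedged sketch of the expected call pattern (new_i_size, handle and
 * inode are assumed to be in scope; the real caller may compute the
 * range differently): zero from the new size to the end of its cluster
 * while the transaction is still open, before i_size is pulled back:
 *
 *	range_start = new_i_size;
 *	range_end = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
 *	if (range_end > range_start)
 *		ret = ocfs2_zero_range_for_truncate(inode, handle,
 *						    range_start, range_end);
 */
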
6700 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6701                                              struct ocfs2_dinode *di)
6702 {
6703         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6704         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6705
6706         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6707                 memset(&di->id2, 0, blocksize -
6708                                     offsetof(struct ocfs2_dinode, id2) -
6709                                     xattrsize);
6710         else
6711                 memset(&di->id2, 0, blocksize -
6712                                     offsetof(struct ocfs2_dinode, id2));
6713 }
6714
6715 void ocfs2_dinode_new_extent_list(struct inode *inode,
6716                                   struct ocfs2_dinode *di)
6717 {
6718         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6719         di->id2.i_list.l_tree_depth = 0;
6720         di->id2.i_list.l_next_free_rec = 0;
6721         di->id2.i_list.l_count = cpu_to_le16(
6722                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6723 }
6724
6725 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6726 {
6727         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6728         struct ocfs2_inline_data *idata = &di->id2.i_data;
6729
6730         spin_lock(&oi->ip_lock);
6731         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6732         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6733         spin_unlock(&oi->ip_lock);
6734
6735         /*
6736          * We clear the entire i_data structure here so that all
6737          * fields can be properly initialized.
6738          */
6739         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6740
6741         idata->id_count = cpu_to_le16(
6742                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6743 }
6744
6745 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6746                                          struct buffer_head *di_bh)
6747 {
6748         int ret, i, has_data, num_pages = 0;
6749         handle_t *handle;
6750         u64 uninitialized_var(block);
6751         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6752         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6753         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6754         struct ocfs2_alloc_context *data_ac = NULL;
6755         struct page **pages = NULL;
6756         loff_t end = osb->s_clustersize;
6757
6758         has_data = i_size_read(inode) ? 1 : 0;
6759
6760         if (has_data) {
6761                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6762                                 sizeof(struct page *), GFP_NOFS);
6763                 if (pages == NULL) {
6764                         ret = -ENOMEM;
6765                         mlog_errno(ret);
6766                         goto out;
6767                 }
6768
6769                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6770                 if (ret) {
6771                         mlog_errno(ret);
6772                         goto out;
6773                 }
6774         }
6775
6776         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6777         if (IS_ERR(handle)) {
6778                 ret = PTR_ERR(handle);
6779                 mlog_errno(ret);
6780                 goto out_unlock;
6781         }
6782
6783         ret = ocfs2_journal_access(handle, inode, di_bh,
6784                                    OCFS2_JOURNAL_ACCESS_WRITE);
6785         if (ret) {
6786                 mlog_errno(ret);
6787                 goto out_commit;
6788         }
6789
6790         if (has_data) {
6791                 u32 bit_off, num;
6792                 unsigned int page_end;
6793                 u64 phys;
6794
6795                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6796                                            &num);
6797                 if (ret) {
6798                         mlog_errno(ret);
6799                         goto out_commit;
6800                 }
6801
6802                 /*
6803                  * Save two copies, one for insert, and one that can
6804                  * be changed by ocfs2_map_and_dirty_page() below.
6805                  */
6806                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6807
6808                 /*
6809                  * Non sparse file systems zero on extend, so no need
6810                  * to do that now.
6811                  */
6812                 if (!ocfs2_sparse_alloc(osb) &&
6813                     PAGE_CACHE_SIZE < osb->s_clustersize)
6814                         end = PAGE_CACHE_SIZE;
6815
6816                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6817                 if (ret) {
6818                         mlog_errno(ret);
6819                         goto out_commit;
6820                 }
6821
6822                 /*
6823                  * This should populate the 1st page for us and mark
6824                  * it up to date.
6825                  */
6826                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6827                 if (ret) {
6828                         mlog_errno(ret);
6829                         goto out_commit;
6830                 }
6831
6832                 page_end = PAGE_CACHE_SIZE;
6833                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6834                         page_end = osb->s_clustersize;
6835
6836                 for (i = 0; i < num_pages; i++)
6837                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6838                                                  pages[i], i > 0, &phys);
6839         }
6840
6841         spin_lock(&oi->ip_lock);
6842         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6843         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6844         spin_unlock(&oi->ip_lock);
6845
6846         ocfs2_dinode_new_extent_list(inode, di);
6847
6848         ocfs2_journal_dirty(handle, di_bh);
6849
6850         if (has_data) {
6851                 /*
6852                  * An error at this point should be extremely rare. If
6853                  * this proves to be false, we could always re-build
6854                  * the in-inode data from our pages.
6855                  */
6856                 ret = ocfs2_dinode_insert_extent(osb, handle, inode, di_bh,
6857                                                  0, block, 1, 0, NULL);
6858                 if (ret) {
6859                         mlog_errno(ret);
6860                         goto out_commit;
6861                 }
6862
6863                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6864         }
6865
6866 out_commit:
6867         ocfs2_commit_trans(osb, handle);
6868
6869 out_unlock:
6870         if (data_ac)
6871                 ocfs2_free_alloc_context(data_ac);
6872
6873 out:
6874         if (pages) {
6875                 ocfs2_unlock_and_free_pages(pages, num_pages);
6876                 kfree(pages);
6877         }
6878
6879         return ret;
6880 }
6881
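/*
 * How the pieces fit together at truncate time, as a hedged sketch
 * (simplified; the real sequence lives in the file code and does more
 * locking and error handling, and ocfs2_prepare_truncate() is declared
 * in alloc.h):
 *
 *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 *	if (!status)
 *		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
 *
 * ocfs2_commit_truncate() below loops, trimming clusters from the
 * rightmost branch one transaction at a time, until the allocation
 * matches the already-updated i_size.
 */
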
6882 /*
6883  * It is expected, that by the time you call this function,
6884  * inode->i_size and fe->i_size have been adjusted.
6885  *
6886  * WARNING: This will kfree the truncate context
6887  */
6888 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6889                           struct inode *inode,
6890                           struct buffer_head *fe_bh,
6891                           struct ocfs2_truncate_context *tc)
6892 {
6893         int status, i, credits, tl_sem = 0;
6894         u32 clusters_to_del, new_highest_cpos, range;
6895         struct ocfs2_extent_list *el;
6896         handle_t *handle = NULL;
6897         struct inode *tl_inode = osb->osb_tl_inode;
6898         struct ocfs2_path *path = NULL;
6899         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6900
6901         mlog_entry_void();
6902
6903         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6904                                                      i_size_read(inode));
6905
6906         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6907         if (!path) {
6908                 status = -ENOMEM;
6909                 mlog_errno(status);
6910                 goto bail;
6911         }
6912
6913         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6914
6915 start:
6916         /*
6917          * Check that we still have allocation to delete.
6918          */
6919         if (OCFS2_I(inode)->ip_clusters == 0) {
6920                 status = 0;
6921                 goto bail;
6922         }
6923
6924         /*
6925          * Truncate always works against the rightmost tree branch.
6926          */
6927         status = ocfs2_find_path(inode, path, UINT_MAX);
6928         if (status) {
6929                 mlog_errno(status);
6930                 goto bail;
6931         }
6932
6933         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6934              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6935
6936         /*
6937          * By now, el will point to the extent list on the bottommost
6938          * portion of this tree. Only the tail record is considered in
6939          * each pass.
6940          *
6941          * We handle the following cases, in order:
6942          * - empty extent: delete the remaining branch
6943          * - remove the entire record
6944          * - remove a partial record
6945          * - no record needs to be removed (truncate has completed)
6946          */
6947         el = path_leaf_el(path);
6948         if (le16_to_cpu(el->l_next_free_rec) == 0) {
6949                 ocfs2_error(inode->i_sb,
6950                             "Inode %llu has empty extent block at %llu\n",
6951                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6952                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
6953                 status = -EROFS;
6954                 goto bail;
6955         }
6956
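        /*
         * 'i' indexes the tail (rightmost) record of the leaf; 'range' is
         * the first cluster offset past that record, which tells us whether
         * the whole record or only part of it lies beyond the new size.
         */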
6957         i = le16_to_cpu(el->l_next_free_rec) - 1;
6958         range = le32_to_cpu(el->l_recs[i].e_cpos) +
6959                 ocfs2_rec_clusters(el, &el->l_recs[i]);
6960         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6961                 clusters_to_del = 0;
6962         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6963                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6964         } else if (range > new_highest_cpos) {
6965                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6966                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
6967                                   new_highest_cpos;
6968         } else {
6969                 status = 0;
6970                 goto bail;
6971         }
6972
6973         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6974              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6975
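        /*
         * The truncate log inode mutex must be held across this pass -
         * ocfs2_do_truncate() appends the clusters freed from the data
         * tree to the truncate log for delayed deallocation.
         */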
6976         mutex_lock(&tl_inode->i_mutex);
6977         tl_sem = 1;
6978         /* ocfs2_truncate_log_needs_flush guarantees us at least one
6979          * record is free for use. If there isn't one, we flush to get
6980          * an empty truncate log.  */
6981         if (ocfs2_truncate_log_needs_flush(osb)) {
6982                 status = __ocfs2_flush_truncate_log(osb);
6983                 if (status < 0) {
6984                         mlog_errno(status);
6985                         goto bail;
6986                 }
6987         }
6988
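        /*
         * Reserve enough journal credits for this pass: the dinode, the
         * truncate log, and the extent tree blocks that may be modified.
         */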
6989         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6990                                                 (struct ocfs2_dinode *)fe_bh->b_data,
6991                                                 el);
6992         handle = ocfs2_start_trans(osb, credits);
6993         if (IS_ERR(handle)) {
6994                 status = PTR_ERR(handle);
6995                 handle = NULL;
6996                 mlog_errno(status);
6997                 goto bail;
6998         }
6999
7000         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7001                                    tc, path);
7002         if (status < 0) {
7003                 mlog_errno(status);
7004                 goto bail;
7005         }
7006
7007         mutex_unlock(&tl_inode->i_mutex);
7008         tl_sem = 0;
7009
7010         ocfs2_commit_trans(osb, handle);
7011         handle = NULL;
7012
7013         ocfs2_reinit_path(path, 1);
7014
7015         /*
7016          * The ip_clusters check at the top of the loop catches the
7017          * case where we've truncated away all allocation.
7018          */
7019         goto start;
7020
7021 bail:
7022
7023         ocfs2_schedule_truncate_log_flush(osb, 1);
7024
7025         if (tl_sem)
7026                 mutex_unlock(&tl_inode->i_mutex);
7027
7028         if (handle)
7029                 ocfs2_commit_trans(osb, handle);
7030
7031         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7032
7033         ocfs2_free_path(path);
7034
7035         /* This will drop the ext_alloc cluster lock for us */
7036         ocfs2_free_truncate_context(tc);
7037
7038         mlog_exit(status);
7039         return status;
7040 }
7041
7042 /*
7043  * Expects the inode to already be locked.
7044  */
7045 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7046                            struct inode *inode,
7047                            struct buffer_head *fe_bh,
7048                            struct ocfs2_truncate_context **tc)
7049 {
7050         int status;
7051         unsigned int new_i_clusters;
7052         struct ocfs2_dinode *fe;
7053         struct ocfs2_extent_block *eb;
7054         struct buffer_head *last_eb_bh = NULL;
7055
7056         mlog_entry_void();
7057
7058         *tc = NULL;
7059
7060         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7061                                                   i_size_read(inode));
7062         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7063
7064         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
7065              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7066              (unsigned long long)le64_to_cpu(fe->i_size));
7067
7068         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7069         if (!(*tc)) {
7070                 status = -ENOMEM;
7071                 mlog_errno(status);
7072                 goto bail;
7073         }
7074         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7075
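        /*
         * If the tree has depth, read and validate its rightmost extent
         * block now and stash it in the truncate context for the commit
         * phase.
         */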
7076         if (fe->id2.i_list.l_tree_depth) {
7077                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
7078                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
7079                 if (status < 0) {
7080                         mlog_errno(status);
7081                         goto bail;
7082                 }
7083                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7084                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
7085                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
7086
7087                         brelse(last_eb_bh);
7088                         status = -EIO;
7089                         goto bail;
7090                 }
7091         }
7092
7093         (*tc)->tc_last_eb_bh = last_eb_bh;
7094
7095         status = 0;
7096 bail:
7097         if (status < 0) {
7098                 if (*tc)
7099                         ocfs2_free_truncate_context(*tc);
7100                 *tc = NULL;
7101         }
7102         mlog_exit_void();
7103         return status;
7104 }
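
/*
 * A minimal sketch of the expected calling sequence, assuming the caller
 * already holds the inode lock and has read the dinode buffer (error
 * handling trimmed; variable names are illustrative, not taken from a
 * real call site):
 *
 *	struct ocfs2_truncate_context *tc = NULL;
 *
 *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 *	if (status < 0)
 *		goto out;
 *
 *	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
 *
 * Note that tc has been freed by ocfs2_commit_truncate() on return,
 * even on error.
 */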
7105
7106 /*
7107  * 'start' is inclusive, 'end' is not.
7108  */
7109 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7110                           unsigned int start, unsigned int end, int trunc)
7111 {
7112         int ret;
7113         unsigned int numbytes;
7114         handle_t *handle;
7115         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7116         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7117         struct ocfs2_inline_data *idata = &di->id2.i_data;
7118
7119         if (end > i_size_read(inode))
7120                 end = i_size_read(inode);
7121
7122         BUG_ON(start >= end);
7123
7124         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7125             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7126             !ocfs2_supports_inline_data(osb)) {
7127                 ocfs2_error(inode->i_sb,
7128                             "Inline data flags for inode %llu don't agree! "
7129                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7130                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7131                             le16_to_cpu(di->i_dyn_features),
7132                             OCFS2_I(inode)->ip_dyn_features,
7133                             osb->s_feature_incompat);
7134                 ret = -EROFS;
7135                 goto out;
7136         }
7137
7138         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7139         if (IS_ERR(handle)) {
7140                 ret = PTR_ERR(handle);
7141                 mlog_errno(ret);
7142                 goto out;
7143         }
7144
7145         ret = ocfs2_journal_access(handle, inode, di_bh,
7146                                    OCFS2_JOURNAL_ACCESS_WRITE);
7147         if (ret) {
7148                 mlog_errno(ret);
7149                 goto out_commit;
7150         }
7151
7152         numbytes = end - start;
7153         memset(idata->id_data + start, 0, numbytes);
7154
7155         /*
7156          * No need to worry about the data page here - it's been
7157          * truncated already and inline data doesn't need it for
7158          * pushing zeros to disk, so we'll let readpage pick it up
7159          * later.
7160          */
7161         if (trunc) {
7162                 i_size_write(inode, start);
7163                 di->i_size = cpu_to_le64(start);
7164         }
7165
7166         inode->i_blocks = ocfs2_inode_sector_count(inode);
7167         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7168
7169         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7170         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7171
7172         ocfs2_journal_dirty(handle, di_bh);
7173
7174 out_commit:
7175         ocfs2_commit_trans(osb, handle);
7176
7177 out:
7178         return ret;
7179 }
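
/*
 * Hypothetical illustration of a caller shrinking an inline-data file to
 * new_size (not lifted from a real call site; new_size must be strictly
 * smaller than the old i_size): zero everything from the new size up to
 * the old i_size and update the sizes in the same pass:
 *
 *	ret = ocfs2_truncate_inline(inode, di_bh, new_size,
 *				    i_size_read(inode), 1);
 *
 * Passing 0 for the last argument instead zeroes the range without
 * changing i_size (a hole punch within the inline area).
 */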
7180
7181 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7182 {
7183         /*
7184          * The caller is responsible for completing deallocation
7185          * before freeing the context.
7186          */
7187         if (tc->tc_dealloc.c_first_suballocator != NULL)
7188                 mlog(ML_NOTICE,
7189                      "Truncate completion has non-empty dealloc context\n");
7190
7191         if (tc->tc_last_eb_bh)
7192                 brelse(tc->tc_last_eb_bh);
7193
7194         kfree(tc);
7195 }