ocfs2: Morph the haphazard OCFS2_IS_VALID_DINODE() checks.
[safe/jmp/linux-2.6] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52
53 /*
54  * Operations for a specific extent tree type.
55  *
56  * To implement an on-disk btree (extent tree) type in ocfs2, add
57  * an ocfs2_extent_tree_operations structure and the matching
58  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
59  * for the allocation portion of the extent tree.
60  */
61 struct ocfs2_extent_tree_operations {
62         /*
63          * last_eb_blk is the block number of the right most leaf extent
64          * block.  Most on-disk structures containing an extent tree store
65          * this value for fast access.  The ->eo_set_last_eb_blk() and
66          * ->eo_get_last_eb_blk() operations access this value.  They are
67          *  both required.
68          */
69         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
70                                    u64 blkno);
71         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
72
73         /*
74          * The on-disk structure usually keeps track of how many total
75          * clusters are stored in this extent tree.  This function updates
76          * that value.  new_clusters is the delta, and must be
77          * added to the total.  Required.
78          */
79         void (*eo_update_clusters)(struct inode *inode,
80                                    struct ocfs2_extent_tree *et,
81                                    u32 new_clusters);
82
83         /*
84          * If ->eo_insert_check() exists, it is called before rec is
85          * inserted into the extent tree.  It is optional.
86          */
87         int (*eo_insert_check)(struct inode *inode,
88                                struct ocfs2_extent_tree *et,
89                                struct ocfs2_extent_rec *rec);
90         int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
91
92         /*
93          * --------------------------------------------------------------
94          * The remaining are internal to ocfs2_extent_tree and don't have
95          * accessor functions
96          */
97
98         /*
99          * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
100          * It is required.
101          */
102         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
103
104         /*
105          * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
106          * it exists.  If it does not, et->et_max_leaf_clusters is set
107          * to 0 (unlimited).  Optional.
108          */
109         void (*eo_fill_max_leaf_clusters)(struct inode *inode,
110                                           struct ocfs2_extent_tree *et);
111 };
112
113
114 /*
115  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
116  * in the methods.
117  */
118 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
119 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
120                                          u64 blkno);
121 static void ocfs2_dinode_update_clusters(struct inode *inode,
122                                          struct ocfs2_extent_tree *et,
123                                          u32 clusters);
124 static int ocfs2_dinode_insert_check(struct inode *inode,
125                                      struct ocfs2_extent_tree *et,
126                                      struct ocfs2_extent_rec *rec);
127 static int ocfs2_dinode_sanity_check(struct inode *inode,
128                                      struct ocfs2_extent_tree *et);
129 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
130 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
131         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
132         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
133         .eo_update_clusters     = ocfs2_dinode_update_clusters,
134         .eo_insert_check        = ocfs2_dinode_insert_check,
135         .eo_sanity_check        = ocfs2_dinode_sanity_check,
136         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
137 };
138
139 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
140                                          u64 blkno)
141 {
142         struct ocfs2_dinode *di = et->et_object;
143
144         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
145         di->i_last_eb_blk = cpu_to_le64(blkno);
146 }
147
148 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
149 {
150         struct ocfs2_dinode *di = et->et_object;
151
152         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
153         return le64_to_cpu(di->i_last_eb_blk);
154 }
155
156 static void ocfs2_dinode_update_clusters(struct inode *inode,
157                                          struct ocfs2_extent_tree *et,
158                                          u32 clusters)
159 {
160         struct ocfs2_dinode *di = et->et_object;
161
162         le32_add_cpu(&di->i_clusters, clusters);
163         spin_lock(&OCFS2_I(inode)->ip_lock);
164         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
165         spin_unlock(&OCFS2_I(inode)->ip_lock);
166 }
167
168 static int ocfs2_dinode_insert_check(struct inode *inode,
169                                      struct ocfs2_extent_tree *et,
170                                      struct ocfs2_extent_rec *rec)
171 {
172         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
173
174         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
175         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
176                         (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
177                         "Device %s, asking for sparse allocation: inode %llu, "
178                         "cpos %u, clusters %u\n",
179                         osb->dev_str,
180                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
181                         rec->e_cpos,
182                         OCFS2_I(inode)->ip_clusters);
183
184         return 0;
185 }
186
187 static int ocfs2_dinode_sanity_check(struct inode *inode,
188                                      struct ocfs2_extent_tree *et)
189 {
190         struct ocfs2_dinode *di = et->et_object;
191
192         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
193         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
194
195         return 0;
196 }
197
198 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
199 {
200         struct ocfs2_dinode *di = et->et_object;
201
202         et->et_root_el = &di->id2.i_list;
203 }
204
205
206 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
207 {
208         struct ocfs2_xattr_value_root *xv = et->et_object;
209
210         et->et_root_el = &xv->xr_list;
211 }
212
213 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
214                                               u64 blkno)
215 {
216         struct ocfs2_xattr_value_root *xv =
217                 (struct ocfs2_xattr_value_root *)et->et_object;
218
219         xv->xr_last_eb_blk = cpu_to_le64(blkno);
220 }
221
222 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
223 {
224         struct ocfs2_xattr_value_root *xv =
225                 (struct ocfs2_xattr_value_root *) et->et_object;
226
227         return le64_to_cpu(xv->xr_last_eb_blk);
228 }
229
230 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
231                                               struct ocfs2_extent_tree *et,
232                                               u32 clusters)
233 {
234         struct ocfs2_xattr_value_root *xv =
235                 (struct ocfs2_xattr_value_root *)et->et_object;
236
237         le32_add_cpu(&xv->xr_clusters, clusters);
238 }
239
240 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
241         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
242         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
243         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
244         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
245 };
246
247 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
248 {
249         struct ocfs2_xattr_block *xb = et->et_object;
250
251         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
252 }
253
254 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
255                                                     struct ocfs2_extent_tree *et)
256 {
257         et->et_max_leaf_clusters =
258                 ocfs2_clusters_for_bytes(inode->i_sb,
259                                          OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
260 }
261
262 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
263                                              u64 blkno)
264 {
265         struct ocfs2_xattr_block *xb = et->et_object;
266         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
267
268         xt->xt_last_eb_blk = cpu_to_le64(blkno);
269 }
270
271 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
272 {
273         struct ocfs2_xattr_block *xb = et->et_object;
274         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
275
276         return le64_to_cpu(xt->xt_last_eb_blk);
277 }
278
279 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
280                                              struct ocfs2_extent_tree *et,
281                                              u32 clusters)
282 {
283         struct ocfs2_xattr_block *xb = et->et_object;
284
285         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
286 }
287
288 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
289         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
290         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
291         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
292         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
293         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
294 };
295
296 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
297                                      struct inode *inode,
298                                      struct buffer_head *bh,
299                                      void *obj,
300                                      struct ocfs2_extent_tree_operations *ops)
301 {
302         et->et_ops = ops;
303         et->et_root_bh = bh;
304         if (!obj)
305                 obj = (void *)bh->b_data;
306         et->et_object = obj;
307
308         et->et_ops->eo_fill_root_el(et);
309         if (!et->et_ops->eo_fill_max_leaf_clusters)
310                 et->et_max_leaf_clusters = 0;
311         else
312                 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
313 }
314
315 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
316                                    struct inode *inode,
317                                    struct buffer_head *bh)
318 {
319         __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
320 }
321
322 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
323                                        struct inode *inode,
324                                        struct buffer_head *bh)
325 {
326         __ocfs2_init_extent_tree(et, inode, bh, NULL,
327                                  &ocfs2_xattr_tree_et_ops);
328 }
329
330 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
331                                         struct inode *inode,
332                                         struct buffer_head *bh,
333                                         struct ocfs2_xattr_value_root *xv)
334 {
335         __ocfs2_init_extent_tree(et, inode, bh, xv,
336                                  &ocfs2_xattr_value_et_ops);
337 }
338
339 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
340                                             u64 new_last_eb_blk)
341 {
342         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
343 }
344
345 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
346 {
347         return et->et_ops->eo_get_last_eb_blk(et);
348 }
349
350 static inline void ocfs2_et_update_clusters(struct inode *inode,
351                                             struct ocfs2_extent_tree *et,
352                                             u32 clusters)
353 {
354         et->et_ops->eo_update_clusters(inode, et, clusters);
355 }
356
357 static inline int ocfs2_et_insert_check(struct inode *inode,
358                                         struct ocfs2_extent_tree *et,
359                                         struct ocfs2_extent_rec *rec)
360 {
361         int ret = 0;
362
363         if (et->et_ops->eo_insert_check)
364                 ret = et->et_ops->eo_insert_check(inode, et, rec);
365         return ret;
366 }
367
368 static inline int ocfs2_et_sanity_check(struct inode *inode,
369                                         struct ocfs2_extent_tree *et)
370 {
371         int ret = 0;
372
373         if (et->et_ops->eo_sanity_check)
374                 ret = et->et_ops->eo_sanity_check(inode, et);
375         return ret;
376 }
377
378 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
379 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
380                                          struct ocfs2_extent_block *eb);
381
382 /*
383  * Structures which describe a path through a btree, and functions to
384  * manipulate them.
385  *
386  * The idea here is to be as generic as possible with the tree
387  * manipulation code.
388  */
389 struct ocfs2_path_item {
390         struct buffer_head              *bh;
391         struct ocfs2_extent_list        *el;
392 };
393
394 #define OCFS2_MAX_PATH_DEPTH    5
395
396 struct ocfs2_path {
397         int                     p_tree_depth;
398         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
399 };
400
401 #define path_root_bh(_path) ((_path)->p_node[0].bh)
402 #define path_root_el(_path) ((_path)->p_node[0].el)
403 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
404 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
405 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
406
407 /*
408  * Reset the actual path elements so that we can re-use the structure
409  * to build another path. Generally, this involves freeing the buffer
410  * heads.
411  */
412 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
413 {
414         int i, start = 0, depth = 0;
415         struct ocfs2_path_item *node;
416
417         if (keep_root)
418                 start = 1;
419
420         for(i = start; i < path_num_items(path); i++) {
421                 node = &path->p_node[i];
422
423                 brelse(node->bh);
424                 node->bh = NULL;
425                 node->el = NULL;
426         }
427
428         /*
429          * Tree depth may change during truncate, or insert. If we're
430          * keeping the root extent list, then make sure that our path
431          * structure reflects the proper depth.
432          */
433         if (keep_root)
434                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
435
436         path->p_tree_depth = depth;
437 }
438
439 static void ocfs2_free_path(struct ocfs2_path *path)
440 {
441         if (path) {
442                 ocfs2_reinit_path(path, 0);
443                 kfree(path);
444         }
445 }
446
447 /*
448  * All the elements of src into dest. After this call, src could be freed
449  * without affecting dest.
450  *
451  * Both paths should have the same root. Any non-root elements of dest
452  * will be freed.
453  */
454 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
455 {
456         int i;
457
458         BUG_ON(path_root_bh(dest) != path_root_bh(src));
459         BUG_ON(path_root_el(dest) != path_root_el(src));
460
461         ocfs2_reinit_path(dest, 1);
462
463         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
464                 dest->p_node[i].bh = src->p_node[i].bh;
465                 dest->p_node[i].el = src->p_node[i].el;
466
467                 if (dest->p_node[i].bh)
468                         get_bh(dest->p_node[i].bh);
469         }
470 }
471
472 /*
473  * Make the *dest path the same as src and re-initialize src path to
474  * have a root only.
475  */
476 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
477 {
478         int i;
479
480         BUG_ON(path_root_bh(dest) != path_root_bh(src));
481
482         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
483                 brelse(dest->p_node[i].bh);
484
485                 dest->p_node[i].bh = src->p_node[i].bh;
486                 dest->p_node[i].el = src->p_node[i].el;
487
488                 src->p_node[i].bh = NULL;
489                 src->p_node[i].el = NULL;
490         }
491 }
492
493 /*
494  * Insert an extent block at given index.
495  *
496  * This will not take an additional reference on eb_bh.
497  */
498 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
499                                         struct buffer_head *eb_bh)
500 {
501         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
502
503         /*
504          * Right now, no root bh is an extent block, so this helps
505          * catch code errors with dinode trees. The assertion can be
506          * safely removed if we ever need to insert extent block
507          * structures at the root.
508          */
509         BUG_ON(index == 0);
510
511         path->p_node[index].bh = eb_bh;
512         path->p_node[index].el = &eb->h_list;
513 }
514
515 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
516                                          struct ocfs2_extent_list *root_el)
517 {
518         struct ocfs2_path *path;
519
520         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
521
522         path = kzalloc(sizeof(*path), GFP_NOFS);
523         if (path) {
524                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
525                 get_bh(root_bh);
526                 path_root_bh(path) = root_bh;
527                 path_root_el(path) = root_el;
528         }
529
530         return path;
531 }
532
533 /*
534  * Convenience function to journal all components in a path.
535  */
536 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
537                                      struct ocfs2_path *path)
538 {
539         int i, ret = 0;
540
541         if (!path)
542                 goto out;
543
544         for(i = 0; i < path_num_items(path); i++) {
545                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
546                                            OCFS2_JOURNAL_ACCESS_WRITE);
547                 if (ret < 0) {
548                         mlog_errno(ret);
549                         goto out;
550                 }
551         }
552
553 out:
554         return ret;
555 }
556
557 /*
558  * Return the index of the extent record which contains cluster #v_cluster.
559  * -1 is returned if it was not found.
560  *
561  * Should work fine on interior and exterior nodes.
562  */
563 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
564 {
565         int ret = -1;
566         int i;
567         struct ocfs2_extent_rec *rec;
568         u32 rec_end, rec_start, clusters;
569
570         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
571                 rec = &el->l_recs[i];
572
573                 rec_start = le32_to_cpu(rec->e_cpos);
574                 clusters = ocfs2_rec_clusters(el, rec);
575
576                 rec_end = rec_start + clusters;
577
578                 if (v_cluster >= rec_start && v_cluster < rec_end) {
579                         ret = i;
580                         break;
581                 }
582         }
583
584         return ret;
585 }
586
587 enum ocfs2_contig_type {
588         CONTIG_NONE = 0,
589         CONTIG_LEFT,
590         CONTIG_RIGHT,
591         CONTIG_LEFTRIGHT,
592 };
593
594
595 /*
596  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
597  * ocfs2_extent_contig only work properly against leaf nodes!
598  */
599 static int ocfs2_block_extent_contig(struct super_block *sb,
600                                      struct ocfs2_extent_rec *ext,
601                                      u64 blkno)
602 {
603         u64 blk_end = le64_to_cpu(ext->e_blkno);
604
605         blk_end += ocfs2_clusters_to_blocks(sb,
606                                     le16_to_cpu(ext->e_leaf_clusters));
607
608         return blkno == blk_end;
609 }
610
611 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
612                                   struct ocfs2_extent_rec *right)
613 {
614         u32 left_range;
615
616         left_range = le32_to_cpu(left->e_cpos) +
617                 le16_to_cpu(left->e_leaf_clusters);
618
619         return (left_range == le32_to_cpu(right->e_cpos));
620 }
621
622 static enum ocfs2_contig_type
623         ocfs2_extent_contig(struct inode *inode,
624                             struct ocfs2_extent_rec *ext,
625                             struct ocfs2_extent_rec *insert_rec)
626 {
627         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
628
629         /*
630          * Refuse to coalesce extent records with different flag
631          * fields - we don't want to mix unwritten extents with user
632          * data.
633          */
634         if (ext->e_flags != insert_rec->e_flags)
635                 return CONTIG_NONE;
636
637         if (ocfs2_extents_adjacent(ext, insert_rec) &&
638             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
639                         return CONTIG_RIGHT;
640
641         blkno = le64_to_cpu(ext->e_blkno);
642         if (ocfs2_extents_adjacent(insert_rec, ext) &&
643             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
644                 return CONTIG_LEFT;
645
646         return CONTIG_NONE;
647 }
648
649 /*
650  * NOTE: We can have pretty much any combination of contiguousness and
651  * appending.
652  *
653  * The usefulness of APPEND_TAIL is more in that it lets us know that
654  * we'll have to update the path to that leaf.
655  */
656 enum ocfs2_append_type {
657         APPEND_NONE = 0,
658         APPEND_TAIL,
659 };
660
661 enum ocfs2_split_type {
662         SPLIT_NONE = 0,
663         SPLIT_LEFT,
664         SPLIT_RIGHT,
665 };
666
667 struct ocfs2_insert_type {
668         enum ocfs2_split_type   ins_split;
669         enum ocfs2_append_type  ins_appending;
670         enum ocfs2_contig_type  ins_contig;
671         int                     ins_contig_index;
672         int                     ins_tree_depth;
673 };
674
675 struct ocfs2_merge_ctxt {
676         enum ocfs2_contig_type  c_contig_type;
677         int                     c_has_empty_extent;
678         int                     c_split_covers_rec;
679 };
680
681 /*
682  * How many free extents have we got before we need more meta data?
683  */
684 int ocfs2_num_free_extents(struct ocfs2_super *osb,
685                            struct inode *inode,
686                            struct ocfs2_extent_tree *et)
687 {
688         int retval;
689         struct ocfs2_extent_list *el = NULL;
690         struct ocfs2_extent_block *eb;
691         struct buffer_head *eb_bh = NULL;
692         u64 last_eb_blk = 0;
693
694         mlog_entry_void();
695
696         el = et->et_root_el;
697         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
698
699         if (last_eb_blk) {
700                 retval = ocfs2_read_block(inode, last_eb_blk,
701                                           &eb_bh);
702                 if (retval < 0) {
703                         mlog_errno(retval);
704                         goto bail;
705                 }
706                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
707                 el = &eb->h_list;
708         }
709
710         BUG_ON(el->l_tree_depth != 0);
711
712         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
713 bail:
714         brelse(eb_bh);
715
716         mlog_exit(retval);
717         return retval;
718 }
719
720 /* expects array to already be allocated
721  *
722  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
723  * l_count for you
724  */
725 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
726                                      handle_t *handle,
727                                      struct inode *inode,
728                                      int wanted,
729                                      struct ocfs2_alloc_context *meta_ac,
730                                      struct buffer_head *bhs[])
731 {
732         int count, status, i;
733         u16 suballoc_bit_start;
734         u32 num_got;
735         u64 first_blkno;
736         struct ocfs2_extent_block *eb;
737
738         mlog_entry_void();
739
740         count = 0;
741         while (count < wanted) {
742                 status = ocfs2_claim_metadata(osb,
743                                               handle,
744                                               meta_ac,
745                                               wanted - count,
746                                               &suballoc_bit_start,
747                                               &num_got,
748                                               &first_blkno);
749                 if (status < 0) {
750                         mlog_errno(status);
751                         goto bail;
752                 }
753
754                 for(i = count;  i < (num_got + count); i++) {
755                         bhs[i] = sb_getblk(osb->sb, first_blkno);
756                         if (bhs[i] == NULL) {
757                                 status = -EIO;
758                                 mlog_errno(status);
759                                 goto bail;
760                         }
761                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
762
763                         status = ocfs2_journal_access(handle, inode, bhs[i],
764                                                       OCFS2_JOURNAL_ACCESS_CREATE);
765                         if (status < 0) {
766                                 mlog_errno(status);
767                                 goto bail;
768                         }
769
770                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
771                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
772                         /* Ok, setup the minimal stuff here. */
773                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
774                         eb->h_blkno = cpu_to_le64(first_blkno);
775                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
776                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
777                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
778                         eb->h_list.l_count =
779                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
780
781                         suballoc_bit_start++;
782                         first_blkno++;
783
784                         /* We'll also be dirtied by the caller, so
785                          * this isn't absolutely necessary. */
786                         status = ocfs2_journal_dirty(handle, bhs[i]);
787                         if (status < 0) {
788                                 mlog_errno(status);
789                                 goto bail;
790                         }
791                 }
792
793                 count += num_got;
794         }
795
796         status = 0;
797 bail:
798         if (status < 0) {
799                 for(i = 0; i < wanted; i++) {
800                         brelse(bhs[i]);
801                         bhs[i] = NULL;
802                 }
803         }
804         mlog_exit(status);
805         return status;
806 }
807
808 /*
809  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
810  *
811  * Returns the sum of the rightmost extent rec logical offset and
812  * cluster count.
813  *
814  * ocfs2_add_branch() uses this to determine what logical cluster
815  * value should be populated into the leftmost new branch records.
816  *
817  * ocfs2_shift_tree_depth() uses this to determine the # clusters
818  * value for the new topmost tree record.
819  */
820 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
821 {
822         int i;
823
824         i = le16_to_cpu(el->l_next_free_rec) - 1;
825
826         return le32_to_cpu(el->l_recs[i].e_cpos) +
827                 ocfs2_rec_clusters(el, &el->l_recs[i]);
828 }
829
830 /*
831  * Add an entire tree branch to our inode. eb_bh is the extent block
832  * to start at, if we don't want to start the branch at the dinode
833  * structure.
834  *
835  * last_eb_bh is required as we have to update it's next_leaf pointer
836  * for the new last extent block.
837  *
838  * the new branch will be 'empty' in the sense that every block will
839  * contain a single record with cluster count == 0.
840  */
841 static int ocfs2_add_branch(struct ocfs2_super *osb,
842                             handle_t *handle,
843                             struct inode *inode,
844                             struct ocfs2_extent_tree *et,
845                             struct buffer_head *eb_bh,
846                             struct buffer_head **last_eb_bh,
847                             struct ocfs2_alloc_context *meta_ac)
848 {
849         int status, new_blocks, i;
850         u64 next_blkno, new_last_eb_blk;
851         struct buffer_head *bh;
852         struct buffer_head **new_eb_bhs = NULL;
853         struct ocfs2_extent_block *eb;
854         struct ocfs2_extent_list  *eb_el;
855         struct ocfs2_extent_list  *el;
856         u32 new_cpos;
857
858         mlog_entry_void();
859
860         BUG_ON(!last_eb_bh || !*last_eb_bh);
861
862         if (eb_bh) {
863                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
864                 el = &eb->h_list;
865         } else
866                 el = et->et_root_el;
867
868         /* we never add a branch to a leaf. */
869         BUG_ON(!el->l_tree_depth);
870
871         new_blocks = le16_to_cpu(el->l_tree_depth);
872
873         /* allocate the number of new eb blocks we need */
874         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
875                              GFP_KERNEL);
876         if (!new_eb_bhs) {
877                 status = -ENOMEM;
878                 mlog_errno(status);
879                 goto bail;
880         }
881
882         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
883                                            meta_ac, new_eb_bhs);
884         if (status < 0) {
885                 mlog_errno(status);
886                 goto bail;
887         }
888
889         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
890         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
891
892         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
893          * linked with the rest of the tree.
894          * conversly, new_eb_bhs[0] is the new bottommost leaf.
895          *
896          * when we leave the loop, new_last_eb_blk will point to the
897          * newest leaf, and next_blkno will point to the topmost extent
898          * block. */
899         next_blkno = new_last_eb_blk = 0;
900         for(i = 0; i < new_blocks; i++) {
901                 bh = new_eb_bhs[i];
902                 eb = (struct ocfs2_extent_block *) bh->b_data;
903                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
904                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
905                         status = -EIO;
906                         goto bail;
907                 }
908                 eb_el = &eb->h_list;
909
910                 status = ocfs2_journal_access(handle, inode, bh,
911                                               OCFS2_JOURNAL_ACCESS_CREATE);
912                 if (status < 0) {
913                         mlog_errno(status);
914                         goto bail;
915                 }
916
917                 eb->h_next_leaf_blk = 0;
918                 eb_el->l_tree_depth = cpu_to_le16(i);
919                 eb_el->l_next_free_rec = cpu_to_le16(1);
920                 /*
921                  * This actually counts as an empty extent as
922                  * c_clusters == 0
923                  */
924                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
925                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
926                 /*
927                  * eb_el isn't always an interior node, but even leaf
928                  * nodes want a zero'd flags and reserved field so
929                  * this gets the whole 32 bits regardless of use.
930                  */
931                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
932                 if (!eb_el->l_tree_depth)
933                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
934
935                 status = ocfs2_journal_dirty(handle, bh);
936                 if (status < 0) {
937                         mlog_errno(status);
938                         goto bail;
939                 }
940
941                 next_blkno = le64_to_cpu(eb->h_blkno);
942         }
943
944         /* This is a bit hairy. We want to update up to three blocks
945          * here without leaving any of them in an inconsistent state
946          * in case of error. We don't have to worry about
947          * journal_dirty erroring as it won't unless we've aborted the
948          * handle (in which case we would never be here) so reserving
949          * the write with journal_access is all we need to do. */
950         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
951                                       OCFS2_JOURNAL_ACCESS_WRITE);
952         if (status < 0) {
953                 mlog_errno(status);
954                 goto bail;
955         }
956         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
957                                       OCFS2_JOURNAL_ACCESS_WRITE);
958         if (status < 0) {
959                 mlog_errno(status);
960                 goto bail;
961         }
962         if (eb_bh) {
963                 status = ocfs2_journal_access(handle, inode, eb_bh,
964                                               OCFS2_JOURNAL_ACCESS_WRITE);
965                 if (status < 0) {
966                         mlog_errno(status);
967                         goto bail;
968                 }
969         }
970
971         /* Link the new branch into the rest of the tree (el will
972          * either be on the root_bh, or the extent block passed in. */
973         i = le16_to_cpu(el->l_next_free_rec);
974         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
975         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
976         el->l_recs[i].e_int_clusters = 0;
977         le16_add_cpu(&el->l_next_free_rec, 1);
978
979         /* fe needs a new last extent block pointer, as does the
980          * next_leaf on the previously last-extent-block. */
981         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
982
983         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
984         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
985
986         status = ocfs2_journal_dirty(handle, *last_eb_bh);
987         if (status < 0)
988                 mlog_errno(status);
989         status = ocfs2_journal_dirty(handle, et->et_root_bh);
990         if (status < 0)
991                 mlog_errno(status);
992         if (eb_bh) {
993                 status = ocfs2_journal_dirty(handle, eb_bh);
994                 if (status < 0)
995                         mlog_errno(status);
996         }
997
998         /*
999          * Some callers want to track the rightmost leaf so pass it
1000          * back here.
1001          */
1002         brelse(*last_eb_bh);
1003         get_bh(new_eb_bhs[0]);
1004         *last_eb_bh = new_eb_bhs[0];
1005
1006         status = 0;
1007 bail:
1008         if (new_eb_bhs) {
1009                 for (i = 0; i < new_blocks; i++)
1010                         brelse(new_eb_bhs[i]);
1011                 kfree(new_eb_bhs);
1012         }
1013
1014         mlog_exit(status);
1015         return status;
1016 }
1017
1018 /*
1019  * adds another level to the allocation tree.
1020  * returns back the new extent block so you can add a branch to it
1021  * after this call.
1022  */
1023 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1024                                   handle_t *handle,
1025                                   struct inode *inode,
1026                                   struct ocfs2_extent_tree *et,
1027                                   struct ocfs2_alloc_context *meta_ac,
1028                                   struct buffer_head **ret_new_eb_bh)
1029 {
1030         int status, i;
1031         u32 new_clusters;
1032         struct buffer_head *new_eb_bh = NULL;
1033         struct ocfs2_extent_block *eb;
1034         struct ocfs2_extent_list  *root_el;
1035         struct ocfs2_extent_list  *eb_el;
1036
1037         mlog_entry_void();
1038
1039         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1040                                            &new_eb_bh);
1041         if (status < 0) {
1042                 mlog_errno(status);
1043                 goto bail;
1044         }
1045
1046         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1047         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1048                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1049                 status = -EIO;
1050                 goto bail;
1051         }
1052
1053         eb_el = &eb->h_list;
1054         root_el = et->et_root_el;
1055
1056         status = ocfs2_journal_access(handle, inode, new_eb_bh,
1057                                       OCFS2_JOURNAL_ACCESS_CREATE);
1058         if (status < 0) {
1059                 mlog_errno(status);
1060                 goto bail;
1061         }
1062
1063         /* copy the root extent list data into the new extent block */
1064         eb_el->l_tree_depth = root_el->l_tree_depth;
1065         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1066         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1067                 eb_el->l_recs[i] = root_el->l_recs[i];
1068
1069         status = ocfs2_journal_dirty(handle, new_eb_bh);
1070         if (status < 0) {
1071                 mlog_errno(status);
1072                 goto bail;
1073         }
1074
1075         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1076                                       OCFS2_JOURNAL_ACCESS_WRITE);
1077         if (status < 0) {
1078                 mlog_errno(status);
1079                 goto bail;
1080         }
1081
1082         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1083
1084         /* update root_bh now */
1085         le16_add_cpu(&root_el->l_tree_depth, 1);
1086         root_el->l_recs[0].e_cpos = 0;
1087         root_el->l_recs[0].e_blkno = eb->h_blkno;
1088         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1089         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1090                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1091         root_el->l_next_free_rec = cpu_to_le16(1);
1092
1093         /* If this is our 1st tree depth shift, then last_eb_blk
1094          * becomes the allocated extent block */
1095         if (root_el->l_tree_depth == cpu_to_le16(1))
1096                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1097
1098         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1099         if (status < 0) {
1100                 mlog_errno(status);
1101                 goto bail;
1102         }
1103
1104         *ret_new_eb_bh = new_eb_bh;
1105         new_eb_bh = NULL;
1106         status = 0;
1107 bail:
1108         brelse(new_eb_bh);
1109
1110         mlog_exit(status);
1111         return status;
1112 }
1113
1114 /*
1115  * Should only be called when there is no space left in any of the
1116  * leaf nodes. What we want to do is find the lowest tree depth
1117  * non-leaf extent block with room for new records. There are three
1118  * valid results of this search:
1119  *
1120  * 1) a lowest extent block is found, then we pass it back in
1121  *    *lowest_eb_bh and return '0'
1122  *
1123  * 2) the search fails to find anything, but the root_el has room. We
1124  *    pass NULL back in *lowest_eb_bh, but still return '0'
1125  *
1126  * 3) the search fails to find anything AND the root_el is full, in
1127  *    which case we return > 0
1128  *
1129  * return status < 0 indicates an error.
1130  */
1131 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1132                                     struct inode *inode,
1133                                     struct ocfs2_extent_tree *et,
1134                                     struct buffer_head **target_bh)
1135 {
1136         int status = 0, i;
1137         u64 blkno;
1138         struct ocfs2_extent_block *eb;
1139         struct ocfs2_extent_list  *el;
1140         struct buffer_head *bh = NULL;
1141         struct buffer_head *lowest_bh = NULL;
1142
1143         mlog_entry_void();
1144
1145         *target_bh = NULL;
1146
1147         el = et->et_root_el;
1148
1149         while(le16_to_cpu(el->l_tree_depth) > 1) {
1150                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1151                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1152                                     "extent list (next_free_rec == 0)",
1153                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1154                         status = -EIO;
1155                         goto bail;
1156                 }
1157                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1158                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1159                 if (!blkno) {
1160                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1161                                     "list where extent # %d has no physical "
1162                                     "block start",
1163                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1164                         status = -EIO;
1165                         goto bail;
1166                 }
1167
1168                 brelse(bh);
1169                 bh = NULL;
1170
1171                 status = ocfs2_read_block(inode, blkno, &bh);
1172                 if (status < 0) {
1173                         mlog_errno(status);
1174                         goto bail;
1175                 }
1176
1177                 eb = (struct ocfs2_extent_block *) bh->b_data;
1178                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1179                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1180                         status = -EIO;
1181                         goto bail;
1182                 }
1183                 el = &eb->h_list;
1184
1185                 if (le16_to_cpu(el->l_next_free_rec) <
1186                     le16_to_cpu(el->l_count)) {
1187                         brelse(lowest_bh);
1188                         lowest_bh = bh;
1189                         get_bh(lowest_bh);
1190                 }
1191         }
1192
1193         /* If we didn't find one and the fe doesn't have any room,
1194          * then return '1' */
1195         el = et->et_root_el;
1196         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1197                 status = 1;
1198
1199         *target_bh = lowest_bh;
1200 bail:
1201         brelse(bh);
1202
1203         mlog_exit(status);
1204         return status;
1205 }
1206
1207 /*
1208  * Grow a b-tree so that it has more records.
1209  *
1210  * We might shift the tree depth in which case existing paths should
1211  * be considered invalid.
1212  *
1213  * Tree depth after the grow is returned via *final_depth.
1214  *
1215  * *last_eb_bh will be updated by ocfs2_add_branch().
1216  */
1217 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1218                            struct ocfs2_extent_tree *et, int *final_depth,
1219                            struct buffer_head **last_eb_bh,
1220                            struct ocfs2_alloc_context *meta_ac)
1221 {
1222         int ret, shift;
1223         struct ocfs2_extent_list *el = et->et_root_el;
1224         int depth = le16_to_cpu(el->l_tree_depth);
1225         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1226         struct buffer_head *bh = NULL;
1227
1228         BUG_ON(meta_ac == NULL);
1229
1230         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1231         if (shift < 0) {
1232                 ret = shift;
1233                 mlog_errno(ret);
1234                 goto out;
1235         }
1236
1237         /* We traveled all the way to the bottom of the allocation tree
1238          * and didn't find room for any more extents - we need to add
1239          * another tree level */
1240         if (shift) {
1241                 BUG_ON(bh);
1242                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1243
1244                 /* ocfs2_shift_tree_depth will return us a buffer with
1245                  * the new extent block (so we can pass that to
1246                  * ocfs2_add_branch). */
1247                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1248                                              meta_ac, &bh);
1249                 if (ret < 0) {
1250                         mlog_errno(ret);
1251                         goto out;
1252                 }
1253                 depth++;
1254                 if (depth == 1) {
1255                         /*
1256                          * Special case: we have room now if we shifted from
1257                          * tree_depth 0, so no more work needs to be done.
1258                          *
1259                          * We won't be calling add_branch, so pass
1260                          * back *last_eb_bh as the new leaf. At depth
1261                          * zero, it should always be null so there's
1262                          * no reason to brelse.
1263                          */
1264                         BUG_ON(*last_eb_bh);
1265                         get_bh(bh);
1266                         *last_eb_bh = bh;
1267                         goto out;
1268                 }
1269         }
1270
1271         /* call ocfs2_add_branch to add the final part of the tree with
1272          * the new data. */
1273         mlog(0, "add branch. bh = %p\n", bh);
1274         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1275                                meta_ac);
1276         if (ret < 0) {
1277                 mlog_errno(ret);
1278                 goto out;
1279         }
1280
1281 out:
1282         if (final_depth)
1283                 *final_depth = depth;
1284         brelse(bh);
1285         return ret;
1286 }
1287
1288 /*
1289  * This function will discard the rightmost extent record.
1290  */
1291 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1292 {
1293         int next_free = le16_to_cpu(el->l_next_free_rec);
1294         int count = le16_to_cpu(el->l_count);
1295         unsigned int num_bytes;
1296
1297         BUG_ON(!next_free);
1298         /* This will cause us to go off the end of our extent list. */
1299         BUG_ON(next_free >= count);
1300
1301         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1302
1303         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1304 }
1305
1306 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1307                               struct ocfs2_extent_rec *insert_rec)
1308 {
1309         int i, insert_index, next_free, has_empty, num_bytes;
1310         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1311         struct ocfs2_extent_rec *rec;
1312
1313         next_free = le16_to_cpu(el->l_next_free_rec);
1314         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1315
1316         BUG_ON(!next_free);
1317
1318         /* The tree code before us didn't allow enough room in the leaf. */
1319         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1320
1321         /*
1322          * The easiest way to approach this is to just remove the
1323          * empty extent and temporarily decrement next_free.
1324          */
1325         if (has_empty) {
1326                 /*
1327                  * If next_free was 1 (only an empty extent), this
1328                  * loop won't execute, which is fine. We still want
1329                  * the decrement above to happen.
1330                  */
1331                 for(i = 0; i < (next_free - 1); i++)
1332                         el->l_recs[i] = el->l_recs[i+1];
1333
1334                 next_free--;
1335         }
1336
1337         /*
1338          * Figure out what the new record index should be.
1339          */
1340         for(i = 0; i < next_free; i++) {
1341                 rec = &el->l_recs[i];
1342
1343                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1344                         break;
1345         }
1346         insert_index = i;
1347
1348         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1349              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1350
1351         BUG_ON(insert_index < 0);
1352         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1353         BUG_ON(insert_index > next_free);
1354
1355         /*
1356          * No need to memmove if we're just adding to the tail.
1357          */
1358         if (insert_index != next_free) {
1359                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1360
1361                 num_bytes = next_free - insert_index;
1362                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1363                 memmove(&el->l_recs[insert_index + 1],
1364                         &el->l_recs[insert_index],
1365                         num_bytes);
1366         }
1367
1368         /*
1369          * Either we had an empty extent, and need to re-increment or
1370          * there was no empty extent on a non full rightmost leaf node,
1371          * in which case we still need to increment.
1372          */
1373         next_free++;
1374         el->l_next_free_rec = cpu_to_le16(next_free);
1375         /*
1376          * Make sure none of the math above just messed up our tree.
1377          */
1378         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1379
1380         el->l_recs[insert_index] = *insert_rec;
1381
1382 }
1383
1384 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1385 {
1386         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1387
1388         BUG_ON(num_recs == 0);
1389
1390         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1391                 num_recs--;
1392                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1393                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1394                 memset(&el->l_recs[num_recs], 0,
1395                        sizeof(struct ocfs2_extent_rec));
1396                 el->l_next_free_rec = cpu_to_le16(num_recs);
1397         }
1398 }
1399
1400 /*
1401  * Create an empty extent record .
1402  *
1403  * l_next_free_rec may be updated.
1404  *
1405  * If an empty extent already exists do nothing.
1406  */
1407 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1408 {
1409         int next_free = le16_to_cpu(el->l_next_free_rec);
1410
1411         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1412
1413         if (next_free == 0)
1414                 goto set_and_inc;
1415
1416         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1417                 return;
1418
1419         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1420                         "Asked to create an empty extent in a full list:\n"
1421                         "count = %u, tree depth = %u",
1422                         le16_to_cpu(el->l_count),
1423                         le16_to_cpu(el->l_tree_depth));
1424
1425         ocfs2_shift_records_right(el);
1426
1427 set_and_inc:
1428         le16_add_cpu(&el->l_next_free_rec, 1);
1429         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1430 }
1431
1432 /*
1433  * For a rotation which involves two leaf nodes, the "root node" is
1434  * the lowest level tree node which contains a path to both leafs. This
1435  * resulting set of information can be used to form a complete "subtree"
1436  *
1437  * This function is passed two full paths from the dinode down to a
1438  * pair of adjacent leaves. It's task is to figure out which path
1439  * index contains the subtree root - this can be the root index itself
1440  * in a worst-case rotation.
1441  *
1442  * The array index of the subtree root is passed back.
1443  */
1444 static int ocfs2_find_subtree_root(struct inode *inode,
1445                                    struct ocfs2_path *left,
1446                                    struct ocfs2_path *right)
1447 {
1448         int i = 0;
1449
1450         /*
1451          * Check that the caller passed in two paths from the same tree.
1452          */
1453         BUG_ON(path_root_bh(left) != path_root_bh(right));
1454
1455         do {
1456                 i++;
1457
1458                 /*
1459                  * The caller didn't pass two adjacent paths.
1460                  */
1461                 mlog_bug_on_msg(i > left->p_tree_depth,
1462                                 "Inode %lu, left depth %u, right depth %u\n"
1463                                 "left leaf blk %llu, right leaf blk %llu\n",
1464                                 inode->i_ino, left->p_tree_depth,
1465                                 right->p_tree_depth,
1466                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1467                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1468         } while (left->p_node[i].bh->b_blocknr ==
1469                  right->p_node[i].bh->b_blocknr);
1470
1471         return i - 1;
1472 }
1473
1474 typedef void (path_insert_t)(void *, struct buffer_head *);
1475
1476 /*
1477  * Traverse a btree path in search of cpos, starting at root_el.
1478  *
1479  * This code can be called with a cpos larger than the tree, in which
1480  * case it will return the rightmost path.
1481  */
1482 static int __ocfs2_find_path(struct inode *inode,
1483                              struct ocfs2_extent_list *root_el, u32 cpos,
1484                              path_insert_t *func, void *data)
1485 {
1486         int i, ret = 0;
1487         u32 range;
1488         u64 blkno;
1489         struct buffer_head *bh = NULL;
1490         struct ocfs2_extent_block *eb;
1491         struct ocfs2_extent_list *el;
1492         struct ocfs2_extent_rec *rec;
1493         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1494
1495         el = root_el;
1496         while (el->l_tree_depth) {
1497                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1498                         ocfs2_error(inode->i_sb,
1499                                     "Inode %llu has empty extent list at "
1500                                     "depth %u\n",
1501                                     (unsigned long long)oi->ip_blkno,
1502                                     le16_to_cpu(el->l_tree_depth));
1503                         ret = -EROFS;
1504                         goto out;
1505
1506                 }
1507
1508                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1509                         rec = &el->l_recs[i];
1510
1511                         /*
1512                          * In the case that cpos is off the allocation
1513                          * tree, this should just wind up returning the
1514                          * rightmost record.
1515                          */
1516                         range = le32_to_cpu(rec->e_cpos) +
1517                                 ocfs2_rec_clusters(el, rec);
1518                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1519                             break;
1520                 }
1521
1522                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1523                 if (blkno == 0) {
1524                         ocfs2_error(inode->i_sb,
1525                                     "Inode %llu has bad blkno in extent list "
1526                                     "at depth %u (index %d)\n",
1527                                     (unsigned long long)oi->ip_blkno,
1528                                     le16_to_cpu(el->l_tree_depth), i);
1529                         ret = -EROFS;
1530                         goto out;
1531                 }
1532
1533                 brelse(bh);
1534                 bh = NULL;
1535                 ret = ocfs2_read_block(inode, blkno, &bh);
1536                 if (ret) {
1537                         mlog_errno(ret);
1538                         goto out;
1539                 }
1540
1541                 eb = (struct ocfs2_extent_block *) bh->b_data;
1542                 el = &eb->h_list;
1543                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1544                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1545                         ret = -EIO;
1546                         goto out;
1547                 }
1548
1549                 if (le16_to_cpu(el->l_next_free_rec) >
1550                     le16_to_cpu(el->l_count)) {
1551                         ocfs2_error(inode->i_sb,
1552                                     "Inode %llu has bad count in extent list "
1553                                     "at block %llu (next free=%u, count=%u)\n",
1554                                     (unsigned long long)oi->ip_blkno,
1555                                     (unsigned long long)bh->b_blocknr,
1556                                     le16_to_cpu(el->l_next_free_rec),
1557                                     le16_to_cpu(el->l_count));
1558                         ret = -EROFS;
1559                         goto out;
1560                 }
1561
1562                 if (func)
1563                         func(data, bh);
1564         }
1565
1566 out:
1567         /*
1568          * Catch any trailing bh that the loop didn't handle.
1569          */
1570         brelse(bh);
1571
1572         return ret;
1573 }
1574
1575 /*
1576  * Given an initialized path (that is, it has a valid root extent
1577  * list), this function will traverse the btree in search of the path
1578  * which would contain cpos.
1579  *
1580  * The path traveled is recorded in the path structure.
1581  *
1582  * Note that this will not do any comparisons on leaf node extent
1583  * records, so it will work fine in the case that we just added a tree
1584  * branch.
1585  */
1586 struct find_path_data {
1587         int index;
1588         struct ocfs2_path *path;
1589 };
1590 static void find_path_ins(void *data, struct buffer_head *bh)
1591 {
1592         struct find_path_data *fp = data;
1593
1594         get_bh(bh);
1595         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1596         fp->index++;
1597 }
1598 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1599                            u32 cpos)
1600 {
1601         struct find_path_data data;
1602
1603         data.index = 1;
1604         data.path = path;
1605         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1606                                  find_path_ins, &data);
1607 }
1608
1609 static void find_leaf_ins(void *data, struct buffer_head *bh)
1610 {
1611         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1612         struct ocfs2_extent_list *el = &eb->h_list;
1613         struct buffer_head **ret = data;
1614
1615         /* We want to retain only the leaf block. */
1616         if (le16_to_cpu(el->l_tree_depth) == 0) {
1617                 get_bh(bh);
1618                 *ret = bh;
1619         }
1620 }
1621 /*
1622  * Find the leaf block in the tree which would contain cpos. No
1623  * checking of the actual leaf is done.
1624  *
1625  * Some paths want to call this instead of allocating a path structure
1626  * and calling ocfs2_find_path().
1627  *
1628  * This function doesn't handle non btree extent lists.
1629  */
1630 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1631                     u32 cpos, struct buffer_head **leaf_bh)
1632 {
1633         int ret;
1634         struct buffer_head *bh = NULL;
1635
1636         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1637         if (ret) {
1638                 mlog_errno(ret);
1639                 goto out;
1640         }
1641
1642         *leaf_bh = bh;
1643 out:
1644         return ret;
1645 }
1646
1647 /*
1648  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1649  *
1650  * Basically, we've moved stuff around at the bottom of the tree and
1651  * we need to fix up the extent records above the changes to reflect
1652  * the new changes.
1653  *
1654  * left_rec: the record on the left.
1655  * left_child_el: is the child list pointed to by left_rec
1656  * right_rec: the record to the right of left_rec
1657  * right_child_el: is the child list pointed to by right_rec
1658  *
1659  * By definition, this only works on interior nodes.
1660  */
1661 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1662                                   struct ocfs2_extent_list *left_child_el,
1663                                   struct ocfs2_extent_rec *right_rec,
1664                                   struct ocfs2_extent_list *right_child_el)
1665 {
1666         u32 left_clusters, right_end;
1667
1668         /*
1669          * Interior nodes never have holes. Their cpos is the cpos of
1670          * the leftmost record in their child list. Their cluster
1671          * count covers the full theoretical range of their child list
1672          * - the range between their cpos and the cpos of the record
1673          * immediately to their right.
1674          */
1675         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1676         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1677                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1678                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1679         }
1680         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1681         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1682
1683         /*
1684          * Calculate the rightmost cluster count boundary before
1685          * moving cpos - we will need to adjust clusters after
1686          * updating e_cpos to keep the same highest cluster count.
1687          */
1688         right_end = le32_to_cpu(right_rec->e_cpos);
1689         right_end += le32_to_cpu(right_rec->e_int_clusters);
1690
1691         right_rec->e_cpos = left_rec->e_cpos;
1692         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1693
1694         right_end -= le32_to_cpu(right_rec->e_cpos);
1695         right_rec->e_int_clusters = cpu_to_le32(right_end);
1696 }
1697
1698 /*
1699  * Adjust the adjacent root node records involved in a
1700  * rotation. left_el_blkno is passed in as a key so that we can easily
1701  * find it's index in the root list.
1702  */
1703 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1704                                       struct ocfs2_extent_list *left_el,
1705                                       struct ocfs2_extent_list *right_el,
1706                                       u64 left_el_blkno)
1707 {
1708         int i;
1709
1710         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1711                le16_to_cpu(left_el->l_tree_depth));
1712
1713         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1714                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1715                         break;
1716         }
1717
1718         /*
1719          * The path walking code should have never returned a root and
1720          * two paths which are not adjacent.
1721          */
1722         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1723
1724         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1725                                       &root_el->l_recs[i + 1], right_el);
1726 }
1727
1728 /*
1729  * We've changed a leaf block (in right_path) and need to reflect that
1730  * change back up the subtree.
1731  *
1732  * This happens in multiple places:
1733  *   - When we've moved an extent record from the left path leaf to the right
1734  *     path leaf to make room for an empty extent in the left path leaf.
1735  *   - When our insert into the right path leaf is at the leftmost edge
1736  *     and requires an update of the path immediately to it's left. This
1737  *     can occur at the end of some types of rotation and appending inserts.
1738  *   - When we've adjusted the last extent record in the left path leaf and the
1739  *     1st extent record in the right path leaf during cross extent block merge.
1740  */
1741 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1742                                        struct ocfs2_path *left_path,
1743                                        struct ocfs2_path *right_path,
1744                                        int subtree_index)
1745 {
1746         int ret, i, idx;
1747         struct ocfs2_extent_list *el, *left_el, *right_el;
1748         struct ocfs2_extent_rec *left_rec, *right_rec;
1749         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1750
1751         /*
1752          * Update the counts and position values within all the
1753          * interior nodes to reflect the leaf rotation we just did.
1754          *
1755          * The root node is handled below the loop.
1756          *
1757          * We begin the loop with right_el and left_el pointing to the
1758          * leaf lists and work our way up.
1759          *
1760          * NOTE: within this loop, left_el and right_el always refer
1761          * to the *child* lists.
1762          */
1763         left_el = path_leaf_el(left_path);
1764         right_el = path_leaf_el(right_path);
1765         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1766                 mlog(0, "Adjust records at index %u\n", i);
1767
1768                 /*
1769                  * One nice property of knowing that all of these
1770                  * nodes are below the root is that we only deal with
1771                  * the leftmost right node record and the rightmost
1772                  * left node record.
1773                  */
1774                 el = left_path->p_node[i].el;
1775                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1776                 left_rec = &el->l_recs[idx];
1777
1778                 el = right_path->p_node[i].el;
1779                 right_rec = &el->l_recs[0];
1780
1781                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1782                                               right_el);
1783
1784                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1785                 if (ret)
1786                         mlog_errno(ret);
1787
1788                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1789                 if (ret)
1790                         mlog_errno(ret);
1791
1792                 /*
1793                  * Setup our list pointers now so that the current
1794                  * parents become children in the next iteration.
1795                  */
1796                 left_el = left_path->p_node[i].el;
1797                 right_el = right_path->p_node[i].el;
1798         }
1799
1800         /*
1801          * At the root node, adjust the two adjacent records which
1802          * begin our path to the leaves.
1803          */
1804
1805         el = left_path->p_node[subtree_index].el;
1806         left_el = left_path->p_node[subtree_index + 1].el;
1807         right_el = right_path->p_node[subtree_index + 1].el;
1808
1809         ocfs2_adjust_root_records(el, left_el, right_el,
1810                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1811
1812         root_bh = left_path->p_node[subtree_index].bh;
1813
1814         ret = ocfs2_journal_dirty(handle, root_bh);
1815         if (ret)
1816                 mlog_errno(ret);
1817 }
1818
1819 static int ocfs2_rotate_subtree_right(struct inode *inode,
1820                                       handle_t *handle,
1821                                       struct ocfs2_path *left_path,
1822                                       struct ocfs2_path *right_path,
1823                                       int subtree_index)
1824 {
1825         int ret, i;
1826         struct buffer_head *right_leaf_bh;
1827         struct buffer_head *left_leaf_bh = NULL;
1828         struct buffer_head *root_bh;
1829         struct ocfs2_extent_list *right_el, *left_el;
1830         struct ocfs2_extent_rec move_rec;
1831
1832         left_leaf_bh = path_leaf_bh(left_path);
1833         left_el = path_leaf_el(left_path);
1834
1835         if (left_el->l_next_free_rec != left_el->l_count) {
1836                 ocfs2_error(inode->i_sb,
1837                             "Inode %llu has non-full interior leaf node %llu"
1838                             "(next free = %u)",
1839                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1840                             (unsigned long long)left_leaf_bh->b_blocknr,
1841                             le16_to_cpu(left_el->l_next_free_rec));
1842                 return -EROFS;
1843         }
1844
1845         /*
1846          * This extent block may already have an empty record, so we
1847          * return early if so.
1848          */
1849         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1850                 return 0;
1851
1852         root_bh = left_path->p_node[subtree_index].bh;
1853         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1854
1855         ret = ocfs2_journal_access(handle, inode, root_bh,
1856                                    OCFS2_JOURNAL_ACCESS_WRITE);
1857         if (ret) {
1858                 mlog_errno(ret);
1859                 goto out;
1860         }
1861
1862         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1863                 ret = ocfs2_journal_access(handle, inode,
1864                                            right_path->p_node[i].bh,
1865                                            OCFS2_JOURNAL_ACCESS_WRITE);
1866                 if (ret) {
1867                         mlog_errno(ret);
1868                         goto out;
1869                 }
1870
1871                 ret = ocfs2_journal_access(handle, inode,
1872                                            left_path->p_node[i].bh,
1873                                            OCFS2_JOURNAL_ACCESS_WRITE);
1874                 if (ret) {
1875                         mlog_errno(ret);
1876                         goto out;
1877                 }
1878         }
1879
1880         right_leaf_bh = path_leaf_bh(right_path);
1881         right_el = path_leaf_el(right_path);
1882
1883         /* This is a code error, not a disk corruption. */
1884         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1885                         "because rightmost leaf block %llu is empty\n",
1886                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1887                         (unsigned long long)right_leaf_bh->b_blocknr);
1888
1889         ocfs2_create_empty_extent(right_el);
1890
1891         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1892         if (ret) {
1893                 mlog_errno(ret);
1894                 goto out;
1895         }
1896
1897         /* Do the copy now. */
1898         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1899         move_rec = left_el->l_recs[i];
1900         right_el->l_recs[0] = move_rec;
1901
1902         /*
1903          * Clear out the record we just copied and shift everything
1904          * over, leaving an empty extent in the left leaf.
1905          *
1906          * We temporarily subtract from next_free_rec so that the
1907          * shift will lose the tail record (which is now defunct).
1908          */
1909         le16_add_cpu(&left_el->l_next_free_rec, -1);
1910         ocfs2_shift_records_right(left_el);
1911         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1912         le16_add_cpu(&left_el->l_next_free_rec, 1);
1913
1914         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1915         if (ret) {
1916                 mlog_errno(ret);
1917                 goto out;
1918         }
1919
1920         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1921                                 subtree_index);
1922
1923 out:
1924         return ret;
1925 }
1926
1927 /*
1928  * Given a full path, determine what cpos value would return us a path
1929  * containing the leaf immediately to the left of the current one.
1930  *
1931  * Will return zero if the path passed in is already the leftmost path.
1932  */
1933 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1934                                          struct ocfs2_path *path, u32 *cpos)
1935 {
1936         int i, j, ret = 0;
1937         u64 blkno;
1938         struct ocfs2_extent_list *el;
1939
1940         BUG_ON(path->p_tree_depth == 0);
1941
1942         *cpos = 0;
1943
1944         blkno = path_leaf_bh(path)->b_blocknr;
1945
1946         /* Start at the tree node just above the leaf and work our way up. */
1947         i = path->p_tree_depth - 1;
1948         while (i >= 0) {
1949                 el = path->p_node[i].el;
1950
1951                 /*
1952                  * Find the extent record just before the one in our
1953                  * path.
1954                  */
1955                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1956                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1957                                 if (j == 0) {
1958                                         if (i == 0) {
1959                                                 /*
1960                                                  * We've determined that the
1961                                                  * path specified is already
1962                                                  * the leftmost one - return a
1963                                                  * cpos of zero.
1964                                                  */
1965                                                 goto out;
1966                                         }
1967                                         /*
1968                                          * The leftmost record points to our
1969                                          * leaf - we need to travel up the
1970                                          * tree one level.
1971                                          */
1972                                         goto next_node;
1973                                 }
1974
1975                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1976                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1977                                                            &el->l_recs[j - 1]);
1978                                 *cpos = *cpos - 1;
1979                                 goto out;
1980                         }
1981                 }
1982
1983                 /*
1984                  * If we got here, we never found a valid node where
1985                  * the tree indicated one should be.
1986                  */
1987                 ocfs2_error(sb,
1988                             "Invalid extent tree at extent block %llu\n",
1989                             (unsigned long long)blkno);
1990                 ret = -EROFS;
1991                 goto out;
1992
1993 next_node:
1994                 blkno = path->p_node[i].bh->b_blocknr;
1995                 i--;
1996         }
1997
1998 out:
1999         return ret;
2000 }
2001
2002 /*
2003  * Extend the transaction by enough credits to complete the rotation,
2004  * and still leave at least the original number of credits allocated
2005  * to this transaction.
2006  */
2007 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2008                                            int op_credits,
2009                                            struct ocfs2_path *path)
2010 {
2011         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2012
2013         if (handle->h_buffer_credits < credits)
2014                 return ocfs2_extend_trans(handle, credits);
2015
2016         return 0;
2017 }
2018
2019 /*
2020  * Trap the case where we're inserting into the theoretical range past
2021  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2022  * whose cpos is less than ours into the right leaf.
2023  *
2024  * It's only necessary to look at the rightmost record of the left
2025  * leaf because the logic that calls us should ensure that the
2026  * theoretical ranges in the path components above the leaves are
2027  * correct.
2028  */
2029 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2030                                                  u32 insert_cpos)
2031 {
2032         struct ocfs2_extent_list *left_el;
2033         struct ocfs2_extent_rec *rec;
2034         int next_free;
2035
2036         left_el = path_leaf_el(left_path);
2037         next_free = le16_to_cpu(left_el->l_next_free_rec);
2038         rec = &left_el->l_recs[next_free - 1];
2039
2040         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2041                 return 1;
2042         return 0;
2043 }
2044
2045 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2046 {
2047         int next_free = le16_to_cpu(el->l_next_free_rec);
2048         unsigned int range;
2049         struct ocfs2_extent_rec *rec;
2050
2051         if (next_free == 0)
2052                 return 0;
2053
2054         rec = &el->l_recs[0];
2055         if (ocfs2_is_empty_extent(rec)) {
2056                 /* Empty list. */
2057                 if (next_free == 1)
2058                         return 0;
2059                 rec = &el->l_recs[1];
2060         }
2061
2062         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2063         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2064                 return 1;
2065         return 0;
2066 }
2067
2068 /*
2069  * Rotate all the records in a btree right one record, starting at insert_cpos.
2070  *
2071  * The path to the rightmost leaf should be passed in.
2072  *
2073  * The array is assumed to be large enough to hold an entire path (tree depth).
2074  *
2075  * Upon succesful return from this function:
2076  *
2077  * - The 'right_path' array will contain a path to the leaf block
2078  *   whose range contains e_cpos.
2079  * - That leaf block will have a single empty extent in list index 0.
2080  * - In the case that the rotation requires a post-insert update,
2081  *   *ret_left_path will contain a valid path which can be passed to
2082  *   ocfs2_insert_path().
2083  */
2084 static int ocfs2_rotate_tree_right(struct inode *inode,
2085                                    handle_t *handle,
2086                                    enum ocfs2_split_type split,
2087                                    u32 insert_cpos,
2088                                    struct ocfs2_path *right_path,
2089                                    struct ocfs2_path **ret_left_path)
2090 {
2091         int ret, start, orig_credits = handle->h_buffer_credits;
2092         u32 cpos;
2093         struct ocfs2_path *left_path = NULL;
2094
2095         *ret_left_path = NULL;
2096
2097         left_path = ocfs2_new_path(path_root_bh(right_path),
2098                                    path_root_el(right_path));
2099         if (!left_path) {
2100                 ret = -ENOMEM;
2101                 mlog_errno(ret);
2102                 goto out;
2103         }
2104
2105         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2106         if (ret) {
2107                 mlog_errno(ret);
2108                 goto out;
2109         }
2110
2111         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2112
2113         /*
2114          * What we want to do here is:
2115          *
2116          * 1) Start with the rightmost path.
2117          *
2118          * 2) Determine a path to the leaf block directly to the left
2119          *    of that leaf.
2120          *
2121          * 3) Determine the 'subtree root' - the lowest level tree node
2122          *    which contains a path to both leaves.
2123          *
2124          * 4) Rotate the subtree.
2125          *
2126          * 5) Find the next subtree by considering the left path to be
2127          *    the new right path.
2128          *
2129          * The check at the top of this while loop also accepts
2130          * insert_cpos == cpos because cpos is only a _theoretical_
2131          * value to get us the left path - insert_cpos might very well
2132          * be filling that hole.
2133          *
2134          * Stop at a cpos of '0' because we either started at the
2135          * leftmost branch (i.e., a tree with one branch and a
2136          * rotation inside of it), or we've gone as far as we can in
2137          * rotating subtrees.
2138          */
2139         while (cpos && insert_cpos <= cpos) {
2140                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2141                      insert_cpos, cpos);
2142
2143                 ret = ocfs2_find_path(inode, left_path, cpos);
2144                 if (ret) {
2145                         mlog_errno(ret);
2146                         goto out;
2147                 }
2148
2149                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2150                                 path_leaf_bh(right_path),
2151                                 "Inode %lu: error during insert of %u "
2152                                 "(left path cpos %u) results in two identical "
2153                                 "paths ending at %llu\n",
2154                                 inode->i_ino, insert_cpos, cpos,
2155                                 (unsigned long long)
2156                                 path_leaf_bh(left_path)->b_blocknr);
2157
2158                 if (split == SPLIT_NONE &&
2159                     ocfs2_rotate_requires_path_adjustment(left_path,
2160                                                           insert_cpos)) {
2161
2162                         /*
2163                          * We've rotated the tree as much as we
2164                          * should. The rest is up to
2165                          * ocfs2_insert_path() to complete, after the
2166                          * record insertion. We indicate this
2167                          * situation by returning the left path.
2168                          *
2169                          * The reason we don't adjust the records here
2170                          * before the record insert is that an error
2171                          * later might break the rule where a parent
2172                          * record e_cpos will reflect the actual
2173                          * e_cpos of the 1st nonempty record of the
2174                          * child list.
2175                          */
2176                         *ret_left_path = left_path;
2177                         goto out_ret_path;
2178                 }
2179
2180                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2181
2182                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2183                      start,
2184                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2185                      right_path->p_tree_depth);
2186
2187                 ret = ocfs2_extend_rotate_transaction(handle, start,
2188                                                       orig_credits, right_path);
2189                 if (ret) {
2190                         mlog_errno(ret);
2191                         goto out;
2192                 }
2193
2194                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2195                                                  right_path, start);
2196                 if (ret) {
2197                         mlog_errno(ret);
2198                         goto out;
2199                 }
2200
2201                 if (split != SPLIT_NONE &&
2202                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2203                                                 insert_cpos)) {
2204                         /*
2205                          * A rotate moves the rightmost left leaf
2206                          * record over to the leftmost right leaf
2207                          * slot. If we're doing an extent split
2208                          * instead of a real insert, then we have to
2209                          * check that the extent to be split wasn't
2210                          * just moved over. If it was, then we can
2211                          * exit here, passing left_path back -
2212                          * ocfs2_split_extent() is smart enough to
2213                          * search both leaves.
2214                          */
2215                         *ret_left_path = left_path;
2216                         goto out_ret_path;
2217                 }
2218
2219                 /*
2220                  * There is no need to re-read the next right path
2221                  * as we know that it'll be our current left
2222                  * path. Optimize by copying values instead.
2223                  */
2224                 ocfs2_mv_path(right_path, left_path);
2225
2226                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2227                                                     &cpos);
2228                 if (ret) {
2229                         mlog_errno(ret);
2230                         goto out;
2231                 }
2232         }
2233
2234 out:
2235         ocfs2_free_path(left_path);
2236
2237 out_ret_path:
2238         return ret;
2239 }
2240
2241 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2242                                       struct ocfs2_path *path)
2243 {
2244         int i, idx;
2245         struct ocfs2_extent_rec *rec;
2246         struct ocfs2_extent_list *el;
2247         struct ocfs2_extent_block *eb;
2248         u32 range;
2249
2250         /* Path should always be rightmost. */
2251         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2252         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2253
2254         el = &eb->h_list;
2255         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2256         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2257         rec = &el->l_recs[idx];
2258         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2259
2260         for (i = 0; i < path->p_tree_depth; i++) {
2261                 el = path->p_node[i].el;
2262                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2263                 rec = &el->l_recs[idx];
2264
2265                 rec->e_int_clusters = cpu_to_le32(range);
2266                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2267
2268                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2269         }
2270 }
2271
2272 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2273                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2274                               struct ocfs2_path *path, int unlink_start)
2275 {
2276         int ret, i;
2277         struct ocfs2_extent_block *eb;
2278         struct ocfs2_extent_list *el;
2279         struct buffer_head *bh;
2280
2281         for(i = unlink_start; i < path_num_items(path); i++) {
2282                 bh = path->p_node[i].bh;
2283
2284                 eb = (struct ocfs2_extent_block *)bh->b_data;
2285                 /*
2286                  * Not all nodes might have had their final count
2287                  * decremented by the caller - handle this here.
2288                  */
2289                 el = &eb->h_list;
2290                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2291                         mlog(ML_ERROR,
2292                              "Inode %llu, attempted to remove extent block "
2293                              "%llu with %u records\n",
2294                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2295                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2296                              le16_to_cpu(el->l_next_free_rec));
2297
2298                         ocfs2_journal_dirty(handle, bh);
2299                         ocfs2_remove_from_cache(inode, bh);
2300                         continue;
2301                 }
2302
2303                 el->l_next_free_rec = 0;
2304                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2305
2306                 ocfs2_journal_dirty(handle, bh);
2307
2308                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2309                 if (ret)
2310                         mlog_errno(ret);
2311
2312                 ocfs2_remove_from_cache(inode, bh);
2313         }
2314 }
2315
2316 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2317                                  struct ocfs2_path *left_path,
2318                                  struct ocfs2_path *right_path,
2319                                  int subtree_index,
2320                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2321 {
2322         int i;
2323         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2324         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2325         struct ocfs2_extent_list *el;
2326         struct ocfs2_extent_block *eb;
2327
2328         el = path_leaf_el(left_path);
2329
2330         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2331
2332         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2333                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2334                         break;
2335
2336         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2337
2338         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2339         le16_add_cpu(&root_el->l_next_free_rec, -1);
2340
2341         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2342         eb->h_next_leaf_blk = 0;
2343
2344         ocfs2_journal_dirty(handle, root_bh);
2345         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2346
2347         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2348                           subtree_index + 1);
2349 }
2350
2351 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2352                                      struct ocfs2_path *left_path,
2353                                      struct ocfs2_path *right_path,
2354                                      int subtree_index,
2355                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2356                                      int *deleted,
2357                                      struct ocfs2_extent_tree *et)
2358 {
2359         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2360         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2361         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2362         struct ocfs2_extent_block *eb;
2363
2364         *deleted = 0;
2365
2366         right_leaf_el = path_leaf_el(right_path);
2367         left_leaf_el = path_leaf_el(left_path);
2368         root_bh = left_path->p_node[subtree_index].bh;
2369         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2370
2371         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2372                 return 0;
2373
2374         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2375         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2376                 /*
2377                  * It's legal for us to proceed if the right leaf is
2378                  * the rightmost one and it has an empty extent. There
2379                  * are two cases to handle - whether the leaf will be
2380                  * empty after removal or not. If the leaf isn't empty
2381                  * then just remove the empty extent up front. The
2382                  * next block will handle empty leaves by flagging
2383                  * them for unlink.
2384                  *
2385                  * Non rightmost leaves will throw -EAGAIN and the
2386                  * caller can manually move the subtree and retry.
2387                  */
2388
2389                 if (eb->h_next_leaf_blk != 0ULL)
2390                         return -EAGAIN;
2391
2392                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2393                         ret = ocfs2_journal_access(handle, inode,
2394                                                    path_leaf_bh(right_path),
2395                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2396                         if (ret) {
2397                                 mlog_errno(ret);
2398                                 goto out;
2399                         }
2400
2401                         ocfs2_remove_empty_extent(right_leaf_el);
2402                 } else
2403                         right_has_empty = 1;
2404         }
2405
2406         if (eb->h_next_leaf_blk == 0ULL &&
2407             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2408                 /*
2409                  * We have to update i_last_eb_blk during the meta
2410                  * data delete.
2411                  */
2412                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2413                                            OCFS2_JOURNAL_ACCESS_WRITE);
2414                 if (ret) {
2415                         mlog_errno(ret);
2416                         goto out;
2417                 }
2418
2419                 del_right_subtree = 1;
2420         }
2421
2422         /*
2423          * Getting here with an empty extent in the right path implies
2424          * that it's the rightmost path and will be deleted.
2425          */
2426         BUG_ON(right_has_empty && !del_right_subtree);
2427
2428         ret = ocfs2_journal_access(handle, inode, root_bh,
2429                                    OCFS2_JOURNAL_ACCESS_WRITE);
2430         if (ret) {
2431                 mlog_errno(ret);
2432                 goto out;
2433         }
2434
2435         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2436                 ret = ocfs2_journal_access(handle, inode,
2437                                            right_path->p_node[i].bh,
2438                                            OCFS2_JOURNAL_ACCESS_WRITE);
2439                 if (ret) {
2440                         mlog_errno(ret);
2441                         goto out;
2442                 }
2443
2444                 ret = ocfs2_journal_access(handle, inode,
2445                                            left_path->p_node[i].bh,
2446                                            OCFS2_JOURNAL_ACCESS_WRITE);
2447                 if (ret) {
2448                         mlog_errno(ret);
2449                         goto out;
2450                 }
2451         }
2452
2453         if (!right_has_empty) {
2454                 /*
2455                  * Only do this if we're moving a real
2456                  * record. Otherwise, the action is delayed until
2457                  * after removal of the right path in which case we
2458                  * can do a simple shift to remove the empty extent.
2459                  */
2460                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2461                 memset(&right_leaf_el->l_recs[0], 0,
2462                        sizeof(struct ocfs2_extent_rec));
2463         }
2464         if (eb->h_next_leaf_blk == 0ULL) {
2465                 /*
2466                  * Move recs over to get rid of empty extent, decrease
2467                  * next_free. This is allowed to remove the last
2468                  * extent in our leaf (setting l_next_free_rec to
2469                  * zero) - the delete code below won't care.
2470                  */
2471                 ocfs2_remove_empty_extent(right_leaf_el);
2472         }
2473
2474         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2475         if (ret)
2476                 mlog_errno(ret);
2477         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2478         if (ret)
2479                 mlog_errno(ret);
2480
2481         if (del_right_subtree) {
2482                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2483                                      subtree_index, dealloc);
2484                 ocfs2_update_edge_lengths(inode, handle, left_path);
2485
2486                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2487                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2488
2489                 /*
2490                  * Removal of the extent in the left leaf was skipped
2491                  * above so we could delete the right path
2492                  * 1st.
2493                  */
2494                 if (right_has_empty)
2495                         ocfs2_remove_empty_extent(left_leaf_el);
2496
2497                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2498                 if (ret)
2499                         mlog_errno(ret);
2500
2501                 *deleted = 1;
2502         } else
2503                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2504                                            subtree_index);
2505
2506 out:
2507         return ret;
2508 }
2509
2510 /*
2511  * Given a full path, determine what cpos value would return us a path
2512  * containing the leaf immediately to the right of the current one.
2513  *
2514  * Will return zero if the path passed in is already the rightmost path.
2515  *
2516  * This looks similar, but is subtly different to
2517  * ocfs2_find_cpos_for_left_leaf().
2518  */
2519 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2520                                           struct ocfs2_path *path, u32 *cpos)
2521 {
2522         int i, j, ret = 0;
2523         u64 blkno;
2524         struct ocfs2_extent_list *el;
2525
2526         *cpos = 0;
2527
2528         if (path->p_tree_depth == 0)
2529                 return 0;
2530
2531         blkno = path_leaf_bh(path)->b_blocknr;
2532
2533         /* Start at the tree node just above the leaf and work our way up. */
2534         i = path->p_tree_depth - 1;
2535         while (i >= 0) {
2536                 int next_free;
2537
2538                 el = path->p_node[i].el;
2539
2540                 /*
2541                  * Find the extent record just after the one in our
2542                  * path.
2543                  */
2544                 next_free = le16_to_cpu(el->l_next_free_rec);
2545                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2546                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2547                                 if (j == (next_free - 1)) {
2548                                         if (i == 0) {
2549                                                 /*
2550                                                  * We've determined that the
2551                                                  * path specified is already
2552                                                  * the rightmost one - return a
2553                                                  * cpos of zero.
2554                                                  */
2555                                                 goto out;
2556                                         }
2557                                         /*
2558                                          * The rightmost record points to our
2559                                          * leaf - we need to travel up the
2560                                          * tree one level.
2561                                          */
2562                                         goto next_node;
2563                                 }
2564
2565                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2566                                 goto out;
2567                         }
2568                 }
2569
2570                 /*
2571                  * If we got here, we never found a valid node where
2572                  * the tree indicated one should be.
2573                  */
2574                 ocfs2_error(sb,
2575                             "Invalid extent tree at extent block %llu\n",
2576                             (unsigned long long)blkno);
2577                 ret = -EROFS;
2578                 goto out;
2579
2580 next_node:
2581                 blkno = path->p_node[i].bh->b_blocknr;
2582                 i--;
2583         }
2584
2585 out:
2586         return ret;
2587 }
2588
2589 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2590                                             handle_t *handle,
2591                                             struct buffer_head *bh,
2592                                             struct ocfs2_extent_list *el)
2593 {
2594         int ret;
2595
2596         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2597                 return 0;
2598
2599         ret = ocfs2_journal_access(handle, inode, bh,
2600                                    OCFS2_JOURNAL_ACCESS_WRITE);
2601         if (ret) {
2602                 mlog_errno(ret);
2603                 goto out;
2604         }
2605
2606         ocfs2_remove_empty_extent(el);
2607
2608         ret = ocfs2_journal_dirty(handle, bh);
2609         if (ret)
2610                 mlog_errno(ret);
2611
2612 out:
2613         return ret;
2614 }
2615
2616 static int __ocfs2_rotate_tree_left(struct inode *inode,
2617                                     handle_t *handle, int orig_credits,
2618                                     struct ocfs2_path *path,
2619                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2620                                     struct ocfs2_path **empty_extent_path,
2621                                     struct ocfs2_extent_tree *et)
2622 {
2623         int ret, subtree_root, deleted;
2624         u32 right_cpos;
2625         struct ocfs2_path *left_path = NULL;
2626         struct ocfs2_path *right_path = NULL;
2627
2628         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2629
2630         *empty_extent_path = NULL;
2631
2632         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2633                                              &right_cpos);
2634         if (ret) {
2635                 mlog_errno(ret);
2636                 goto out;
2637         }
2638
2639         left_path = ocfs2_new_path(path_root_bh(path),
2640                                    path_root_el(path));
2641         if (!left_path) {
2642                 ret = -ENOMEM;
2643                 mlog_errno(ret);
2644                 goto out;
2645         }
2646
2647         ocfs2_cp_path(left_path, path);
2648
2649         right_path = ocfs2_new_path(path_root_bh(path),
2650                                     path_root_el(path));
2651         if (!right_path) {
2652                 ret = -ENOMEM;
2653                 mlog_errno(ret);
2654                 goto out;
2655         }
2656
2657         while (right_cpos) {
2658                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2659                 if (ret) {
2660                         mlog_errno(ret);
2661                         goto out;
2662                 }
2663
2664                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2665                                                        right_path);
2666
2667                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2668                      subtree_root,
2669                      (unsigned long long)
2670                      right_path->p_node[subtree_root].bh->b_blocknr,
2671                      right_path->p_tree_depth);
2672
2673                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2674                                                       orig_credits, left_path);
2675                 if (ret) {
2676                         mlog_errno(ret);
2677                         goto out;
2678                 }
2679
2680                 /*
2681                  * Caller might still want to make changes to the
2682                  * tree root, so re-add it to the journal here.
2683                  */
2684                 ret = ocfs2_journal_access(handle, inode,
2685                                            path_root_bh(left_path),
2686                                            OCFS2_JOURNAL_ACCESS_WRITE);
2687                 if (ret) {
2688                         mlog_errno(ret);
2689                         goto out;
2690                 }
2691
2692                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2693                                                 right_path, subtree_root,
2694                                                 dealloc, &deleted, et);
2695                 if (ret == -EAGAIN) {
2696                         /*
2697                          * The rotation has to temporarily stop due to
2698                          * the right subtree having an empty
2699                          * extent. Pass it back to the caller for a
2700                          * fixup.
2701                          */
2702                         *empty_extent_path = right_path;
2703                         right_path = NULL;
2704                         goto out;
2705                 }
2706                 if (ret) {
2707                         mlog_errno(ret);
2708                         goto out;
2709                 }
2710
2711                 /*
2712                  * The subtree rotate might have removed records on
2713                  * the rightmost edge. If so, then rotation is
2714                  * complete.
2715                  */
2716                 if (deleted)
2717                         break;
2718
2719                 ocfs2_mv_path(left_path, right_path);
2720
2721                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2722                                                      &right_cpos);
2723                 if (ret) {
2724                         mlog_errno(ret);
2725                         goto out;
2726                 }
2727         }
2728
2729 out:
2730         ocfs2_free_path(right_path);
2731         ocfs2_free_path(left_path);
2732
2733         return ret;
2734 }
2735
2736 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2737                                 struct ocfs2_path *path,
2738                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2739                                 struct ocfs2_extent_tree *et)
2740 {
2741         int ret, subtree_index;
2742         u32 cpos;
2743         struct ocfs2_path *left_path = NULL;
2744         struct ocfs2_extent_block *eb;
2745         struct ocfs2_extent_list *el;
2746
2747
2748         ret = ocfs2_et_sanity_check(inode, et);
2749         if (ret)
2750                 goto out;
2751         /*
2752          * There's two ways we handle this depending on
2753          * whether path is the only existing one.
2754          */
2755         ret = ocfs2_extend_rotate_transaction(handle, 0,
2756                                               handle->h_buffer_credits,
2757                                               path);
2758         if (ret) {
2759                 mlog_errno(ret);
2760                 goto out;
2761         }
2762
2763         ret = ocfs2_journal_access_path(inode, handle, path);
2764         if (ret) {
2765                 mlog_errno(ret);
2766                 goto out;
2767         }
2768
2769         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2770         if (ret) {
2771                 mlog_errno(ret);
2772                 goto out;
2773         }
2774
2775         if (cpos) {
2776                 /*
2777                  * We have a path to the left of this one - it needs
2778                  * an update too.
2779                  */
2780                 left_path = ocfs2_new_path(path_root_bh(path),
2781                                            path_root_el(path));
2782                 if (!left_path) {
2783                         ret = -ENOMEM;
2784                         mlog_errno(ret);
2785                         goto out;
2786                 }
2787
2788                 ret = ocfs2_find_path(inode, left_path, cpos);
2789                 if (ret) {
2790                         mlog_errno(ret);
2791                         goto out;
2792                 }
2793
2794                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2795                 if (ret) {
2796                         mlog_errno(ret);
2797                         goto out;
2798                 }
2799
2800                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2801
2802                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2803                                      subtree_index, dealloc);
2804                 ocfs2_update_edge_lengths(inode, handle, left_path);
2805
2806                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2807                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2808         } else {
2809                 /*
2810                  * 'path' is also the leftmost path which
2811                  * means it must be the only one. This gets
2812                  * handled differently because we want to
2813                  * revert the inode back to having extents
2814                  * in-line.
2815                  */
2816                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2817
2818                 el = et->et_root_el;
2819                 el->l_tree_depth = 0;
2820                 el->l_next_free_rec = 0;
2821                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2822
2823                 ocfs2_et_set_last_eb_blk(et, 0);
2824         }
2825
2826         ocfs2_journal_dirty(handle, path_root_bh(path));
2827
2828 out:
2829         ocfs2_free_path(left_path);
2830         return ret;
2831 }
2832
2833 /*
2834  * Left rotation of btree records.
2835  *
2836  * In many ways, this is (unsurprisingly) the opposite of right
2837  * rotation. We start at some non-rightmost path containing an empty
2838  * extent in the leaf block. The code works its way to the rightmost
2839  * path by rotating records to the left in every subtree.
2840  *
2841  * This is used by any code which reduces the number of extent records
2842  * in a leaf. After removal, an empty record should be placed in the
2843  * leftmost list position.
2844  *
2845  * This won't handle a length update of the rightmost path records if
2846  * the rightmost tree leaf record is removed so the caller is
2847  * responsible for detecting and correcting that.
2848  */
2849 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2850                                   struct ocfs2_path *path,
2851                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2852                                   struct ocfs2_extent_tree *et)
2853 {
2854         int ret, orig_credits = handle->h_buffer_credits;
2855         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2856         struct ocfs2_extent_block *eb;
2857         struct ocfs2_extent_list *el;
2858
2859         el = path_leaf_el(path);
2860         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2861                 return 0;
2862
2863         if (path->p_tree_depth == 0) {
2864 rightmost_no_delete:
2865                 /*
2866                  * Inline extents. This is trivially handled, so do
2867                  * it up front.
2868                  */
2869                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2870                                                        path_leaf_bh(path),
2871                                                        path_leaf_el(path));
2872                 if (ret)
2873                         mlog_errno(ret);
2874                 goto out;
2875         }
2876
2877         /*
2878          * Handle rightmost branch now. There's several cases:
2879          *  1) simple rotation leaving records in there. That's trivial.
2880          *  2) rotation requiring a branch delete - there's no more
2881          *     records left. Two cases of this:
2882          *     a) There are branches to the left.
2883          *     b) This is also the leftmost (the only) branch.
2884          *
2885          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2886          *  2a) we need the left branch so that we can update it with the unlink
2887          *  2b) we need to bring the inode back to inline extents.
2888          */
2889
2890         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2891         el = &eb->h_list;
2892         if (eb->h_next_leaf_blk == 0) {
2893                 /*
2894                  * This gets a bit tricky if we're going to delete the
2895                  * rightmost path. Get the other cases out of the way
2896                  * 1st.
2897                  */
2898                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2899                         goto rightmost_no_delete;
2900
2901                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2902                         ret = -EIO;
2903                         ocfs2_error(inode->i_sb,
2904                                     "Inode %llu has empty extent block at %llu",
2905                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2906                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2907                         goto out;
2908                 }
2909
2910                 /*
2911                  * XXX: The caller can not trust "path" any more after
2912                  * this as it will have been deleted. What do we do?
2913                  *
2914                  * In theory the rotate-for-merge code will never get
2915                  * here because it'll always ask for a rotate in a
2916                  * nonempty list.
2917                  */
2918
2919                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2920                                                   dealloc, et);
2921                 if (ret)
2922                         mlog_errno(ret);
2923                 goto out;
2924         }
2925
2926         /*
2927          * Now we can loop, remembering the path we get from -EAGAIN
2928          * and restarting from there.
2929          */
2930 try_rotate:
2931         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2932                                        dealloc, &restart_path, et);
2933         if (ret && ret != -EAGAIN) {
2934                 mlog_errno(ret);
2935                 goto out;
2936         }
2937
2938         while (ret == -EAGAIN) {
2939                 tmp_path = restart_path;
2940                 restart_path = NULL;
2941
2942                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2943                                                tmp_path, dealloc,
2944                                                &restart_path, et);
2945                 if (ret && ret != -EAGAIN) {
2946                         mlog_errno(ret);
2947                         goto out;
2948                 }
2949
2950                 ocfs2_free_path(tmp_path);
2951                 tmp_path = NULL;
2952
2953                 if (ret == 0)
2954                         goto try_rotate;
2955         }
2956
2957 out:
2958         ocfs2_free_path(tmp_path);
2959         ocfs2_free_path(restart_path);
2960         return ret;
2961 }
2962
2963 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2964                                 int index)
2965 {
2966         struct ocfs2_extent_rec *rec = &el->l_recs[index];
2967         unsigned int size;
2968
2969         if (rec->e_leaf_clusters == 0) {
2970                 /*
2971                  * We consumed all of the merged-from record. An empty
2972                  * extent cannot exist anywhere but the 1st array
2973                  * position, so move things over if the merged-from
2974                  * record doesn't occupy that position.
2975                  *
2976                  * This creates a new empty extent so the caller
2977                  * should be smart enough to have removed any existing
2978                  * ones.
2979                  */
2980                 if (index > 0) {
2981                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2982                         size = index * sizeof(struct ocfs2_extent_rec);
2983                         memmove(&el->l_recs[1], &el->l_recs[0], size);
2984                 }
2985
2986                 /*
2987                  * Always memset - the caller doesn't check whether it
2988                  * created an empty extent, so there could be junk in
2989                  * the other fields.
2990                  */
2991                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2992         }
2993 }
2994
2995 static int ocfs2_get_right_path(struct inode *inode,
2996                                 struct ocfs2_path *left_path,
2997                                 struct ocfs2_path **ret_right_path)
2998 {
2999         int ret;
3000         u32 right_cpos;
3001         struct ocfs2_path *right_path = NULL;
3002         struct ocfs2_extent_list *left_el;
3003
3004         *ret_right_path = NULL;
3005
3006         /* This function shouldn't be called for non-trees. */
3007         BUG_ON(left_path->p_tree_depth == 0);
3008
3009         left_el = path_leaf_el(left_path);
3010         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3011
3012         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3013                                              &right_cpos);
3014         if (ret) {
3015                 mlog_errno(ret);
3016                 goto out;
3017         }
3018
3019         /* This function shouldn't be called for the rightmost leaf. */
3020         BUG_ON(right_cpos == 0);
3021
3022         right_path = ocfs2_new_path(path_root_bh(left_path),
3023                                     path_root_el(left_path));
3024         if (!right_path) {
3025                 ret = -ENOMEM;
3026                 mlog_errno(ret);
3027                 goto out;
3028         }
3029
3030         ret = ocfs2_find_path(inode, right_path, right_cpos);
3031         if (ret) {
3032                 mlog_errno(ret);
3033                 goto out;
3034         }
3035
3036         *ret_right_path = right_path;
3037 out:
3038         if (ret)
3039                 ocfs2_free_path(right_path);
3040         return ret;
3041 }
3042
3043 /*
3044  * Remove split_rec clusters from the record at index and merge them
3045  * onto the beginning of the record "next" to it.
3046  * For index < l_count - 1, the next means the extent rec at index + 1.
3047  * For index == l_count - 1, the "next" means the 1st extent rec of the
3048  * next extent block.
3049  */
3050 static int ocfs2_merge_rec_right(struct inode *inode,
3051                                  struct ocfs2_path *left_path,
3052                                  handle_t *handle,
3053                                  struct ocfs2_extent_rec *split_rec,
3054                                  int index)
3055 {
3056         int ret, next_free, i;
3057         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3058         struct ocfs2_extent_rec *left_rec;
3059         struct ocfs2_extent_rec *right_rec;
3060         struct ocfs2_extent_list *right_el;
3061         struct ocfs2_path *right_path = NULL;
3062         int subtree_index = 0;
3063         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3064         struct buffer_head *bh = path_leaf_bh(left_path);
3065         struct buffer_head *root_bh = NULL;
3066
3067         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3068         left_rec = &el->l_recs[index];
3069
3070         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3071             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3072                 /* we meet with a cross extent block merge. */
3073                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3074                 if (ret) {
3075                         mlog_errno(ret);
3076                         goto out;
3077                 }
3078
3079                 right_el = path_leaf_el(right_path);
3080                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3081                 BUG_ON(next_free <= 0);
3082                 right_rec = &right_el->l_recs[0];
3083                 if (ocfs2_is_empty_extent(right_rec)) {
3084                         BUG_ON(next_free <= 1);
3085                         right_rec = &right_el->l_recs[1];
3086                 }
3087
3088                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3089                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3090                        le32_to_cpu(right_rec->e_cpos));
3091
3092                 subtree_index = ocfs2_find_subtree_root(inode,
3093                                                         left_path, right_path);
3094
3095                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3096                                                       handle->h_buffer_credits,
3097                                                       right_path);
3098                 if (ret) {
3099                         mlog_errno(ret);
3100                         goto out;
3101                 }
3102
3103                 root_bh = left_path->p_node[subtree_index].bh;
3104                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3105
3106                 ret = ocfs2_journal_access(handle, inode, root_bh,
3107                                            OCFS2_JOURNAL_ACCESS_WRITE);
3108                 if (ret) {
3109                         mlog_errno(ret);
3110                         goto out;
3111                 }
3112
3113                 for (i = subtree_index + 1;
3114                      i < path_num_items(right_path); i++) {
3115                         ret = ocfs2_journal_access(handle, inode,
3116                                                    right_path->p_node[i].bh,
3117                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3118                         if (ret) {
3119                                 mlog_errno(ret);
3120                                 goto out;
3121                         }
3122
3123                         ret = ocfs2_journal_access(handle, inode,
3124                                                    left_path->p_node[i].bh,
3125                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3126                         if (ret) {
3127                                 mlog_errno(ret);
3128                                 goto out;
3129                         }
3130                 }
3131
3132         } else {
3133                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3134                 right_rec = &el->l_recs[index + 1];
3135         }
3136
3137         ret = ocfs2_journal_access(handle, inode, bh,
3138                                    OCFS2_JOURNAL_ACCESS_WRITE);
3139         if (ret) {
3140                 mlog_errno(ret);
3141                 goto out;
3142         }
3143
3144         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3145
3146         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3147         le64_add_cpu(&right_rec->e_blkno,
3148                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3149         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3150
3151         ocfs2_cleanup_merge(el, index);
3152
3153         ret = ocfs2_journal_dirty(handle, bh);
3154         if (ret)
3155                 mlog_errno(ret);
3156
3157         if (right_path) {
3158                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3159                 if (ret)
3160                         mlog_errno(ret);
3161
3162                 ocfs2_complete_edge_insert(inode, handle, left_path,
3163                                            right_path, subtree_index);
3164         }
3165 out:
3166         if (right_path)
3167                 ocfs2_free_path(right_path);
3168         return ret;
3169 }
3170
3171 static int ocfs2_get_left_path(struct inode *inode,
3172                                struct ocfs2_path *right_path,
3173                                struct ocfs2_path **ret_left_path)
3174 {
3175         int ret;
3176         u32 left_cpos;
3177         struct ocfs2_path *left_path = NULL;
3178
3179         *ret_left_path = NULL;
3180
3181         /* This function shouldn't be called for non-trees. */
3182         BUG_ON(right_path->p_tree_depth == 0);
3183
3184         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3185                                             right_path, &left_cpos);
3186         if (ret) {
3187                 mlog_errno(ret);
3188                 goto out;
3189         }
3190
3191         /* This function shouldn't be called for the leftmost leaf. */
3192         BUG_ON(left_cpos == 0);
3193
3194         left_path = ocfs2_new_path(path_root_bh(right_path),
3195                                    path_root_el(right_path));
3196         if (!left_path) {
3197                 ret = -ENOMEM;
3198                 mlog_errno(ret);
3199                 goto out;
3200         }
3201
3202         ret = ocfs2_find_path(inode, left_path, left_cpos);
3203         if (ret) {
3204                 mlog_errno(ret);
3205                 goto out;
3206         }
3207
3208         *ret_left_path = left_path;
3209 out:
3210         if (ret)
3211                 ocfs2_free_path(left_path);
3212         return ret;
3213 }
3214
3215 /*
3216  * Remove split_rec clusters from the record at index and merge them
3217  * onto the tail of the record "before" it.
3218  * For index > 0, the "before" means the extent rec at index - 1.
3219  *
3220  * For index == 0, the "before" means the last record of the previous
3221  * extent block. And there is also a situation that we may need to
3222  * remove the rightmost leaf extent block in the right_path and change
3223  * the right path to indicate the new rightmost path.
3224  */
3225 static int ocfs2_merge_rec_left(struct inode *inode,
3226                                 struct ocfs2_path *right_path,
3227                                 handle_t *handle,
3228                                 struct ocfs2_extent_rec *split_rec,
3229                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3230                                 struct ocfs2_extent_tree *et,
3231                                 int index)
3232 {
3233         int ret, i, subtree_index = 0, has_empty_extent = 0;
3234         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3235         struct ocfs2_extent_rec *left_rec;
3236         struct ocfs2_extent_rec *right_rec;
3237         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3238         struct buffer_head *bh = path_leaf_bh(right_path);
3239         struct buffer_head *root_bh = NULL;
3240         struct ocfs2_path *left_path = NULL;
3241         struct ocfs2_extent_list *left_el;
3242
3243         BUG_ON(index < 0);
3244
3245         right_rec = &el->l_recs[index];
3246         if (index == 0) {
3247                 /* we meet with a cross extent block merge. */
3248                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3249                 if (ret) {
3250                         mlog_errno(ret);
3251                         goto out;
3252                 }
3253
3254                 left_el = path_leaf_el(left_path);
3255                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3256                        le16_to_cpu(left_el->l_count));
3257
3258                 left_rec = &left_el->l_recs[
3259                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3260                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3261                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3262                        le32_to_cpu(split_rec->e_cpos));
3263
3264                 subtree_index = ocfs2_find_subtree_root(inode,
3265                                                         left_path, right_path);
3266
3267                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3268                                                       handle->h_buffer_credits,
3269                                                       left_path);
3270                 if (ret) {
3271                         mlog_errno(ret);
3272                         goto out;
3273                 }
3274
3275                 root_bh = left_path->p_node[subtree_index].bh;
3276                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3277
3278                 ret = ocfs2_journal_access(handle, inode, root_bh,
3279                                            OCFS2_JOURNAL_ACCESS_WRITE);
3280                 if (ret) {
3281                         mlog_errno(ret);
3282                         goto out;
3283                 }
3284
3285                 for (i = subtree_index + 1;
3286                      i < path_num_items(right_path); i++) {
3287                         ret = ocfs2_journal_access(handle, inode,
3288                                                    right_path->p_node[i].bh,
3289                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3290                         if (ret) {
3291                                 mlog_errno(ret);
3292                                 goto out;
3293                         }
3294
3295                         ret = ocfs2_journal_access(handle, inode,
3296                                                    left_path->p_node[i].bh,
3297                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3298                         if (ret) {
3299                                 mlog_errno(ret);
3300                                 goto out;
3301                         }
3302                 }
3303         } else {
3304                 left_rec = &el->l_recs[index - 1];
3305                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3306                         has_empty_extent = 1;
3307         }
3308
3309         ret = ocfs2_journal_access(handle, inode, bh,
3310                                    OCFS2_JOURNAL_ACCESS_WRITE);
3311         if (ret) {
3312                 mlog_errno(ret);
3313                 goto out;
3314         }
3315
3316         if (has_empty_extent && index == 1) {
3317                 /*
3318                  * The easy case - we can just plop the record right in.
3319                  */
3320                 *left_rec = *split_rec;
3321
3322                 has_empty_extent = 0;
3323         } else
3324                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3325
3326         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3327         le64_add_cpu(&right_rec->e_blkno,
3328                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3329         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3330
3331         ocfs2_cleanup_merge(el, index);
3332
3333         ret = ocfs2_journal_dirty(handle, bh);
3334         if (ret)
3335                 mlog_errno(ret);
3336
3337         if (left_path) {
3338                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3339                 if (ret)
3340                         mlog_errno(ret);
3341
3342                 /*
3343                  * In the situation that the right_rec is empty and the extent
3344                  * block is empty also,  ocfs2_complete_edge_insert can't handle
3345                  * it and we need to delete the right extent block.
3346                  */
3347                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3348                     le16_to_cpu(el->l_next_free_rec) == 1) {
3349
3350                         ret = ocfs2_remove_rightmost_path(inode, handle,
3351                                                           right_path,
3352                                                           dealloc, et);
3353                         if (ret) {
3354                                 mlog_errno(ret);
3355                                 goto out;
3356                         }
3357
3358                         /* Now the rightmost extent block has been deleted.
3359                          * So we use the new rightmost path.
3360                          */
3361                         ocfs2_mv_path(right_path, left_path);
3362                         left_path = NULL;
3363                 } else
3364                         ocfs2_complete_edge_insert(inode, handle, left_path,
3365                                                    right_path, subtree_index);
3366         }
3367 out:
3368         if (left_path)
3369                 ocfs2_free_path(left_path);
3370         return ret;
3371 }
3372
3373 static int ocfs2_try_to_merge_extent(struct inode *inode,
3374                                      handle_t *handle,
3375                                      struct ocfs2_path *path,
3376                                      int split_index,
3377                                      struct ocfs2_extent_rec *split_rec,
3378                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3379                                      struct ocfs2_merge_ctxt *ctxt,
3380                                      struct ocfs2_extent_tree *et)
3381
3382 {
3383         int ret = 0;
3384         struct ocfs2_extent_list *el = path_leaf_el(path);
3385         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3386
3387         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3388
3389         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3390                 /*
3391                  * The merge code will need to create an empty
3392                  * extent to take the place of the newly
3393                  * emptied slot. Remove any pre-existing empty
3394                  * extents - having more than one in a leaf is
3395                  * illegal.
3396                  */
3397                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3398                                              dealloc, et);
3399                 if (ret) {
3400                         mlog_errno(ret);
3401                         goto out;
3402                 }
3403                 split_index--;
3404                 rec = &el->l_recs[split_index];
3405         }
3406
3407         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3408                 /*
3409                  * Left-right contig implies this.
3410                  */
3411                 BUG_ON(!ctxt->c_split_covers_rec);
3412
3413                 /*
3414                  * Since the leftright insert always covers the entire
3415                  * extent, this call will delete the insert record
3416                  * entirely, resulting in an empty extent record added to
3417                  * the extent block.
3418                  *
3419                  * Since the adding of an empty extent shifts
3420                  * everything back to the right, there's no need to
3421                  * update split_index here.
3422                  *
3423                  * When the split_index is zero, we need to merge it to the
3424                  * prevoius extent block. It is more efficient and easier
3425                  * if we do merge_right first and merge_left later.
3426                  */
3427                 ret = ocfs2_merge_rec_right(inode, path,
3428                                             handle, split_rec,
3429                                             split_index);
3430                 if (ret) {
3431                         mlog_errno(ret);
3432                         goto out;
3433                 }
3434
3435                 /*
3436                  * We can only get this from logic error above.
3437                  */
3438                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3439
3440                 /* The merge left us with an empty extent, remove it. */
3441                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3442                                              dealloc, et);
3443                 if (ret) {
3444                         mlog_errno(ret);
3445                         goto out;
3446                 }
3447
3448                 rec = &el->l_recs[split_index];
3449
3450                 /*
3451                  * Note that we don't pass split_rec here on purpose -
3452                  * we've merged it into the rec already.
3453                  */
3454                 ret = ocfs2_merge_rec_left(inode, path,
3455                                            handle, rec,
3456                                            dealloc, et,
3457                                            split_index);
3458
3459                 if (ret) {
3460                         mlog_errno(ret);
3461                         goto out;
3462                 }
3463
3464                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3465                                              dealloc, et);
3466                 /*
3467                  * Error from this last rotate is not critical, so
3468                  * print but don't bubble it up.
3469                  */
3470                 if (ret)
3471                         mlog_errno(ret);
3472                 ret = 0;
3473         } else {
3474                 /*
3475                  * Merge a record to the left or right.
3476                  *
3477                  * 'contig_type' is relative to the existing record,
3478                  * so for example, if we're "right contig", it's to
3479                  * the record on the left (hence the left merge).
3480                  */
3481                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3482                         ret = ocfs2_merge_rec_left(inode,
3483                                                    path,
3484                                                    handle, split_rec,
3485                                                    dealloc, et,
3486                                                    split_index);
3487                         if (ret) {
3488                                 mlog_errno(ret);
3489                                 goto out;
3490                         }
3491                 } else {
3492                         ret = ocfs2_merge_rec_right(inode,
3493                                                     path,
3494                                                     handle, split_rec,
3495                                                     split_index);
3496                         if (ret) {
3497                                 mlog_errno(ret);
3498                                 goto out;
3499                         }
3500                 }
3501
3502                 if (ctxt->c_split_covers_rec) {
3503                         /*
3504                          * The merge may have left an empty extent in
3505                          * our leaf. Try to rotate it away.
3506                          */
3507                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3508                                                      dealloc, et);
3509                         if (ret)
3510                                 mlog_errno(ret);
3511                         ret = 0;
3512                 }
3513         }
3514
3515 out:
3516         return ret;
3517 }
3518
3519 static void ocfs2_subtract_from_rec(struct super_block *sb,
3520                                     enum ocfs2_split_type split,
3521                                     struct ocfs2_extent_rec *rec,
3522                                     struct ocfs2_extent_rec *split_rec)
3523 {
3524         u64 len_blocks;
3525
3526         len_blocks = ocfs2_clusters_to_blocks(sb,
3527                                 le16_to_cpu(split_rec->e_leaf_clusters));
3528
3529         if (split == SPLIT_LEFT) {
3530                 /*
3531                  * Region is on the left edge of the existing
3532                  * record.
3533                  */
3534                 le32_add_cpu(&rec->e_cpos,
3535                              le16_to_cpu(split_rec->e_leaf_clusters));
3536                 le64_add_cpu(&rec->e_blkno, len_blocks);
3537                 le16_add_cpu(&rec->e_leaf_clusters,
3538                              -le16_to_cpu(split_rec->e_leaf_clusters));
3539         } else {
3540                 /*
3541                  * Region is on the right edge of the existing
3542                  * record.
3543                  */
3544                 le16_add_cpu(&rec->e_leaf_clusters,
3545                              -le16_to_cpu(split_rec->e_leaf_clusters));
3546         }
3547 }
3548
3549 /*
3550  * Do the final bits of extent record insertion at the target leaf
3551  * list. If this leaf is part of an allocation tree, it is assumed
3552  * that the tree above has been prepared.
3553  */
3554 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3555                                  struct ocfs2_extent_list *el,
3556                                  struct ocfs2_insert_type *insert,
3557                                  struct inode *inode)
3558 {
3559         int i = insert->ins_contig_index;
3560         unsigned int range;
3561         struct ocfs2_extent_rec *rec;
3562
3563         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3564
3565         if (insert->ins_split != SPLIT_NONE) {
3566                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3567                 BUG_ON(i == -1);
3568                 rec = &el->l_recs[i];
3569                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3570                                         insert_rec);
3571                 goto rotate;
3572         }
3573
3574         /*
3575          * Contiguous insert - either left or right.
3576          */
3577         if (insert->ins_contig != CONTIG_NONE) {
3578                 rec = &el->l_recs[i];
3579                 if (insert->ins_contig == CONTIG_LEFT) {
3580                         rec->e_blkno = insert_rec->e_blkno;
3581                         rec->e_cpos = insert_rec->e_cpos;
3582                 }
3583                 le16_add_cpu(&rec->e_leaf_clusters,
3584                              le16_to_cpu(insert_rec->e_leaf_clusters));
3585                 return;
3586         }
3587
3588         /*
3589          * Handle insert into an empty leaf.
3590          */
3591         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3592             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3593              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3594                 el->l_recs[0] = *insert_rec;
3595                 el->l_next_free_rec = cpu_to_le16(1);
3596                 return;
3597         }
3598
3599         /*
3600          * Appending insert.
3601          */
3602         if (insert->ins_appending == APPEND_TAIL) {
3603                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3604                 rec = &el->l_recs[i];
3605                 range = le32_to_cpu(rec->e_cpos)
3606                         + le16_to_cpu(rec->e_leaf_clusters);
3607                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3608
3609                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3610                                 le16_to_cpu(el->l_count),
3611                                 "inode %lu, depth %u, count %u, next free %u, "
3612                                 "rec.cpos %u, rec.clusters %u, "
3613                                 "insert.cpos %u, insert.clusters %u\n",
3614                                 inode->i_ino,
3615                                 le16_to_cpu(el->l_tree_depth),
3616                                 le16_to_cpu(el->l_count),
3617                                 le16_to_cpu(el->l_next_free_rec),
3618                                 le32_to_cpu(el->l_recs[i].e_cpos),
3619                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3620                                 le32_to_cpu(insert_rec->e_cpos),
3621                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3622                 i++;
3623                 el->l_recs[i] = *insert_rec;
3624                 le16_add_cpu(&el->l_next_free_rec, 1);
3625                 return;
3626         }
3627
3628 rotate:
3629         /*
3630          * Ok, we have to rotate.
3631          *
3632          * At this point, it is safe to assume that inserting into an
3633          * empty leaf and appending to a leaf have both been handled
3634          * above.
3635          *
3636          * This leaf needs to have space, either by the empty 1st
3637          * extent record, or by virtue of an l_next_rec < l_count.
3638          */
3639         ocfs2_rotate_leaf(el, insert_rec);
3640 }
3641
3642 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3643                                            handle_t *handle,
3644                                            struct ocfs2_path *path,
3645                                            struct ocfs2_extent_rec *insert_rec)
3646 {
3647         int ret, i, next_free;
3648         struct buffer_head *bh;
3649         struct ocfs2_extent_list *el;
3650         struct ocfs2_extent_rec *rec;
3651
3652         /*
3653          * Update everything except the leaf block.
3654          */
3655         for (i = 0; i < path->p_tree_depth; i++) {
3656                 bh = path->p_node[i].bh;
3657                 el = path->p_node[i].el;
3658
3659                 next_free = le16_to_cpu(el->l_next_free_rec);
3660                 if (next_free == 0) {
3661                         ocfs2_error(inode->i_sb,
3662                                     "Dinode %llu has a bad extent list",
3663                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3664                         ret = -EIO;
3665                         return;
3666                 }
3667
3668                 rec = &el->l_recs[next_free - 1];
3669
3670                 rec->e_int_clusters = insert_rec->e_cpos;
3671                 le32_add_cpu(&rec->e_int_clusters,
3672                              le16_to_cpu(insert_rec->e_leaf_clusters));
3673                 le32_add_cpu(&rec->e_int_clusters,
3674                              -le32_to_cpu(rec->e_cpos));
3675
3676                 ret = ocfs2_journal_dirty(handle, bh);
3677                 if (ret)
3678                         mlog_errno(ret);
3679
3680         }
3681 }
3682
3683 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3684                                     struct ocfs2_extent_rec *insert_rec,
3685                                     struct ocfs2_path *right_path,
3686                                     struct ocfs2_path **ret_left_path)
3687 {
3688         int ret, next_free;
3689         struct ocfs2_extent_list *el;
3690         struct ocfs2_path *left_path = NULL;
3691
3692         *ret_left_path = NULL;
3693
3694         /*
3695          * This shouldn't happen for non-trees. The extent rec cluster
3696          * count manipulation below only works for interior nodes.
3697          */
3698         BUG_ON(right_path->p_tree_depth == 0);
3699
3700         /*
3701          * If our appending insert is at the leftmost edge of a leaf,
3702          * then we might need to update the rightmost records of the
3703          * neighboring path.
3704          */
3705         el = path_leaf_el(right_path);
3706         next_free = le16_to_cpu(el->l_next_free_rec);
3707         if (next_free == 0 ||
3708             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3709                 u32 left_cpos;
3710
3711                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3712                                                     &left_cpos);
3713                 if (ret) {
3714                         mlog_errno(ret);
3715                         goto out;
3716                 }
3717
3718                 mlog(0, "Append may need a left path update. cpos: %u, "
3719                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3720                      left_cpos);
3721
3722                 /*
3723                  * No need to worry if the append is already in the
3724                  * leftmost leaf.
3725                  */
3726                 if (left_cpos) {
3727                         left_path = ocfs2_new_path(path_root_bh(right_path),
3728                                                    path_root_el(right_path));
3729                         if (!left_path) {
3730                                 ret = -ENOMEM;
3731                                 mlog_errno(ret);
3732                                 goto out;
3733                         }
3734
3735                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3736                         if (ret) {
3737                                 mlog_errno(ret);
3738                                 goto out;
3739                         }
3740
3741                         /*
3742                          * ocfs2_insert_path() will pass the left_path to the
3743                          * journal for us.
3744                          */
3745                 }
3746         }
3747
3748         ret = ocfs2_journal_access_path(inode, handle, right_path);
3749         if (ret) {
3750                 mlog_errno(ret);
3751                 goto out;
3752         }
3753
3754         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3755
3756         *ret_left_path = left_path;
3757         ret = 0;
3758 out:
3759         if (ret != 0)
3760                 ocfs2_free_path(left_path);
3761
3762         return ret;
3763 }
3764
3765 static void ocfs2_split_record(struct inode *inode,
3766                                struct ocfs2_path *left_path,
3767                                struct ocfs2_path *right_path,
3768                                struct ocfs2_extent_rec *split_rec,
3769                                enum ocfs2_split_type split)
3770 {
3771         int index;
3772         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3773         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3774         struct ocfs2_extent_rec *rec, *tmprec;
3775
3776         right_el = path_leaf_el(right_path);;
3777         if (left_path)
3778                 left_el = path_leaf_el(left_path);
3779
3780         el = right_el;
3781         insert_el = right_el;
3782         index = ocfs2_search_extent_list(el, cpos);
3783         if (index != -1) {
3784                 if (index == 0 && left_path) {
3785                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3786
3787                         /*
3788                          * This typically means that the record
3789                          * started in the left path but moved to the
3790                          * right as a result of rotation. We either
3791                          * move the existing record to the left, or we
3792                          * do the later insert there.
3793                          *
3794                          * In this case, the left path should always
3795                          * exist as the rotate code will have passed
3796                          * it back for a post-insert update.
3797                          */
3798
3799                         if (split == SPLIT_LEFT) {
3800                                 /*
3801                                  * It's a left split. Since we know
3802                                  * that the rotate code gave us an
3803                                  * empty extent in the left path, we
3804                                  * can just do the insert there.
3805                                  */
3806                                 insert_el = left_el;
3807                         } else {
3808                                 /*
3809                                  * Right split - we have to move the
3810                                  * existing record over to the left
3811                                  * leaf. The insert will be into the
3812                                  * newly created empty extent in the
3813                                  * right leaf.
3814                                  */
3815                                 tmprec = &right_el->l_recs[index];
3816                                 ocfs2_rotate_leaf(left_el, tmprec);
3817                                 el = left_el;
3818
3819                                 memset(tmprec, 0, sizeof(*tmprec));
3820                                 index = ocfs2_search_extent_list(left_el, cpos);
3821                                 BUG_ON(index == -1);
3822                         }
3823                 }
3824         } else {
3825                 BUG_ON(!left_path);
3826                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3827                 /*
3828                  * Left path is easy - we can just allow the insert to
3829                  * happen.
3830                  */
3831                 el = left_el;
3832                 insert_el = left_el;
3833                 index = ocfs2_search_extent_list(el, cpos);
3834                 BUG_ON(index == -1);
3835         }
3836
3837         rec = &el->l_recs[index];
3838         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3839         ocfs2_rotate_leaf(insert_el, split_rec);
3840 }
3841
3842 /*
3843  * This function only does inserts on an allocation b-tree. For tree
3844  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3845  *
3846  * right_path is the path we want to do the actual insert
3847  * in. left_path should only be passed in if we need to update that
3848  * portion of the tree after an edge insert.
3849  */
3850 static int ocfs2_insert_path(struct inode *inode,
3851                              handle_t *handle,
3852                              struct ocfs2_path *left_path,
3853                              struct ocfs2_path *right_path,
3854                              struct ocfs2_extent_rec *insert_rec,
3855                              struct ocfs2_insert_type *insert)
3856 {
3857         int ret, subtree_index;
3858         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3859
3860         if (left_path) {
3861                 int credits = handle->h_buffer_credits;
3862
3863                 /*
3864                  * There's a chance that left_path got passed back to
3865                  * us without being accounted for in the
3866                  * journal. Extend our transaction here to be sure we
3867                  * can change those blocks.
3868                  */
3869                 credits += left_path->p_tree_depth;
3870
3871                 ret = ocfs2_extend_trans(handle, credits);
3872                 if (ret < 0) {
3873                         mlog_errno(ret);
3874                         goto out;
3875                 }
3876
3877                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3878                 if (ret < 0) {
3879                         mlog_errno(ret);
3880                         goto out;
3881                 }
3882         }
3883
3884         /*
3885          * Pass both paths to the journal. The majority of inserts
3886          * will be touching all components anyway.
3887          */
3888         ret = ocfs2_journal_access_path(inode, handle, right_path);
3889         if (ret < 0) {
3890                 mlog_errno(ret);
3891                 goto out;
3892         }
3893
3894         if (insert->ins_split != SPLIT_NONE) {
3895                 /*
3896                  * We could call ocfs2_insert_at_leaf() for some types
3897                  * of splits, but it's easier to just let one separate
3898                  * function sort it all out.
3899                  */
3900                 ocfs2_split_record(inode, left_path, right_path,
3901                                    insert_rec, insert->ins_split);
3902
3903                 /*
3904                  * Split might have modified either leaf and we don't
3905                  * have a guarantee that the later edge insert will
3906                  * dirty this for us.
3907                  */
3908                 if (left_path)
3909                         ret = ocfs2_journal_dirty(handle,
3910                                                   path_leaf_bh(left_path));
3911                         if (ret)
3912                                 mlog_errno(ret);
3913         } else
3914                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3915                                      insert, inode);
3916
3917         ret = ocfs2_journal_dirty(handle, leaf_bh);
3918         if (ret)
3919                 mlog_errno(ret);
3920
3921         if (left_path) {
3922                 /*
3923                  * The rotate code has indicated that we need to fix
3924                  * up portions of the tree after the insert.
3925                  *
3926                  * XXX: Should we extend the transaction here?
3927                  */
3928                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3929                                                         right_path);
3930                 ocfs2_complete_edge_insert(inode, handle, left_path,
3931                                            right_path, subtree_index);
3932         }
3933
3934         ret = 0;
3935 out:
3936         return ret;
3937 }
3938
3939 static int ocfs2_do_insert_extent(struct inode *inode,
3940                                   handle_t *handle,
3941                                   struct ocfs2_extent_tree *et,
3942                                   struct ocfs2_extent_rec *insert_rec,
3943                                   struct ocfs2_insert_type *type)
3944 {
3945         int ret, rotate = 0;
3946         u32 cpos;
3947         struct ocfs2_path *right_path = NULL;
3948         struct ocfs2_path *left_path = NULL;
3949         struct ocfs2_extent_list *el;
3950
3951         el = et->et_root_el;
3952
3953         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3954                                    OCFS2_JOURNAL_ACCESS_WRITE);
3955         if (ret) {
3956                 mlog_errno(ret);
3957                 goto out;
3958         }
3959
3960         if (le16_to_cpu(el->l_tree_depth) == 0) {
3961                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3962                 goto out_update_clusters;
3963         }
3964
3965         right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
3966         if (!right_path) {
3967                 ret = -ENOMEM;
3968                 mlog_errno(ret);
3969                 goto out;
3970         }
3971
3972         /*
3973          * Determine the path to start with. Rotations need the
3974          * rightmost path, everything else can go directly to the
3975          * target leaf.
3976          */
3977         cpos = le32_to_cpu(insert_rec->e_cpos);
3978         if (type->ins_appending == APPEND_NONE &&
3979             type->ins_contig == CONTIG_NONE) {
3980                 rotate = 1;
3981                 cpos = UINT_MAX;
3982         }
3983
3984         ret = ocfs2_find_path(inode, right_path, cpos);
3985         if (ret) {
3986                 mlog_errno(ret);
3987                 goto out;
3988         }
3989
3990         /*
3991          * Rotations and appends need special treatment - they modify
3992          * parts of the tree's above them.
3993          *
3994          * Both might pass back a path immediate to the left of the
3995          * one being inserted to. This will be cause
3996          * ocfs2_insert_path() to modify the rightmost records of
3997          * left_path to account for an edge insert.
3998          *
3999          * XXX: When modifying this code, keep in mind that an insert
4000          * can wind up skipping both of these two special cases...
4001          */
4002         if (rotate) {
4003                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
4004                                               le32_to_cpu(insert_rec->e_cpos),
4005                                               right_path, &left_path);
4006                 if (ret) {
4007                         mlog_errno(ret);
4008                         goto out;
4009                 }
4010
4011                 /*
4012                  * ocfs2_rotate_tree_right() might have extended the
4013                  * transaction without re-journaling our tree root.
4014                  */
4015                 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
4016                                            OCFS2_JOURNAL_ACCESS_WRITE);
4017                 if (ret) {
4018                         mlog_errno(ret);
4019                         goto out;
4020                 }
4021         } else if (type->ins_appending == APPEND_TAIL
4022                    && type->ins_contig != CONTIG_LEFT) {
4023                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4024                                                right_path, &left_path);
4025                 if (ret) {
4026                         mlog_errno(ret);
4027                         goto out;
4028                 }
4029         }
4030
4031         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4032                                 insert_rec, type);
4033         if (ret) {
4034                 mlog_errno(ret);
4035                 goto out;
4036         }
4037
4038 out_update_clusters:
4039         if (type->ins_split == SPLIT_NONE)
4040                 ocfs2_et_update_clusters(inode, et,
4041                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4042
4043         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4044         if (ret)
4045                 mlog_errno(ret);
4046
4047 out:
4048         ocfs2_free_path(left_path);
4049         ocfs2_free_path(right_path);
4050
4051         return ret;
4052 }
4053
4054 static enum ocfs2_contig_type
4055 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4056                                struct ocfs2_extent_list *el, int index,
4057                                struct ocfs2_extent_rec *split_rec)
4058 {
4059         int status;
4060         enum ocfs2_contig_type ret = CONTIG_NONE;
4061         u32 left_cpos, right_cpos;
4062         struct ocfs2_extent_rec *rec = NULL;
4063         struct ocfs2_extent_list *new_el;
4064         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4065         struct buffer_head *bh;
4066         struct ocfs2_extent_block *eb;
4067
4068         if (index > 0) {
4069                 rec = &el->l_recs[index - 1];
4070         } else if (path->p_tree_depth > 0) {
4071                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4072                                                        path, &left_cpos);
4073                 if (status)
4074                         goto out;
4075
4076                 if (left_cpos != 0) {
4077                         left_path = ocfs2_new_path(path_root_bh(path),
4078                                                    path_root_el(path));
4079                         if (!left_path)
4080                                 goto out;
4081
4082                         status = ocfs2_find_path(inode, left_path, left_cpos);
4083                         if (status)
4084                                 goto out;
4085
4086                         new_el = path_leaf_el(left_path);
4087
4088                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4089                             le16_to_cpu(new_el->l_count)) {
4090                                 bh = path_leaf_bh(left_path);
4091                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4092                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4093                                                                  eb);
4094                                 goto out;
4095                         }
4096                         rec = &new_el->l_recs[
4097                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4098                 }
4099         }
4100
4101         /*
4102          * We're careful to check for an empty extent record here -
4103          * the merge code will know what to do if it sees one.
4104          */
4105         if (rec) {
4106                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4107                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4108                                 ret = CONTIG_RIGHT;
4109                 } else {
4110                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4111                 }
4112         }
4113
4114         rec = NULL;
4115         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4116                 rec = &el->l_recs[index + 1];
4117         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4118                  path->p_tree_depth > 0) {
4119                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4120                                                         path, &right_cpos);
4121                 if (status)
4122                         goto out;
4123
4124                 if (right_cpos == 0)
4125                         goto out;
4126
4127                 right_path = ocfs2_new_path(path_root_bh(path),
4128                                             path_root_el(path));
4129                 if (!right_path)
4130                         goto out;
4131
4132                 status = ocfs2_find_path(inode, right_path, right_cpos);
4133                 if (status)
4134                         goto out;
4135
4136                 new_el = path_leaf_el(right_path);
4137                 rec = &new_el->l_recs[0];
4138                 if (ocfs2_is_empty_extent(rec)) {
4139                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4140                                 bh = path_leaf_bh(right_path);
4141                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4142                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4143                                                                  eb);
4144                                 goto out;
4145                         }
4146                         rec = &new_el->l_recs[1];
4147                 }
4148         }
4149
4150         if (rec) {
4151                 enum ocfs2_contig_type contig_type;
4152
4153                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4154
4155                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4156                         ret = CONTIG_LEFTRIGHT;
4157                 else if (ret == CONTIG_NONE)
4158                         ret = contig_type;
4159         }
4160
4161 out:
4162         if (left_path)
4163                 ocfs2_free_path(left_path);
4164         if (right_path)
4165                 ocfs2_free_path(right_path);
4166
4167         return ret;
4168 }
4169
4170 static void ocfs2_figure_contig_type(struct inode *inode,
4171                                      struct ocfs2_insert_type *insert,
4172                                      struct ocfs2_extent_list *el,
4173                                      struct ocfs2_extent_rec *insert_rec,
4174                                      struct ocfs2_extent_tree *et)
4175 {
4176         int i;
4177         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4178
4179         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4180
4181         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4182                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4183                                                   insert_rec);
4184                 if (contig_type != CONTIG_NONE) {
4185                         insert->ins_contig_index = i;
4186                         break;
4187                 }
4188         }
4189         insert->ins_contig = contig_type;
4190
4191         if (insert->ins_contig != CONTIG_NONE) {
4192                 struct ocfs2_extent_rec *rec =
4193                                 &el->l_recs[insert->ins_contig_index];
4194                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4195                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4196
4197                 /*
4198                  * Caller might want us to limit the size of extents, don't
4199                  * calculate contiguousness if we might exceed that limit.
4200                  */
4201                 if (et->et_max_leaf_clusters &&
4202                     (len > et->et_max_leaf_clusters))
4203                         insert->ins_contig = CONTIG_NONE;
4204         }
4205 }
4206
4207 /*
4208  * This should only be called against the righmost leaf extent list.
4209  *
4210  * ocfs2_figure_appending_type() will figure out whether we'll have to
4211  * insert at the tail of the rightmost leaf.
4212  *
4213  * This should also work against the root extent list for tree's with 0
4214  * depth. If we consider the root extent list to be the rightmost leaf node
4215  * then the logic here makes sense.
4216  */
4217 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4218                                         struct ocfs2_extent_list *el,
4219                                         struct ocfs2_extent_rec *insert_rec)
4220 {
4221         int i;
4222         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4223         struct ocfs2_extent_rec *rec;
4224
4225         insert->ins_appending = APPEND_NONE;
4226
4227         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4228
4229         if (!el->l_next_free_rec)
4230                 goto set_tail_append;
4231
4232         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4233                 /* Were all records empty? */
4234                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4235                         goto set_tail_append;
4236         }
4237
4238         i = le16_to_cpu(el->l_next_free_rec) - 1;
4239         rec = &el->l_recs[i];
4240
4241         if (cpos >=
4242             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4243                 goto set_tail_append;
4244
4245         return;
4246
4247 set_tail_append:
4248         insert->ins_appending = APPEND_TAIL;
4249 }
4250
4251 /*
4252  * Helper function called at the begining of an insert.
4253  *
4254  * This computes a few things that are commonly used in the process of
4255  * inserting into the btree:
4256  *   - Whether the new extent is contiguous with an existing one.
4257  *   - The current tree depth.
4258  *   - Whether the insert is an appending one.
4259  *   - The total # of free records in the tree.
4260  *
4261  * All of the information is stored on the ocfs2_insert_type
4262  * structure.
4263  */
4264 static int ocfs2_figure_insert_type(struct inode *inode,
4265                                     struct ocfs2_extent_tree *et,
4266                                     struct buffer_head **last_eb_bh,
4267                                     struct ocfs2_extent_rec *insert_rec,
4268                                     int *free_records,
4269                                     struct ocfs2_insert_type *insert)
4270 {
4271         int ret;
4272         struct ocfs2_extent_block *eb;
4273         struct ocfs2_extent_list *el;
4274         struct ocfs2_path *path = NULL;
4275         struct buffer_head *bh = NULL;
4276
4277         insert->ins_split = SPLIT_NONE;
4278
4279         el = et->et_root_el;
4280         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4281
4282         if (el->l_tree_depth) {
4283                 /*
4284                  * If we have tree depth, we read in the
4285                  * rightmost extent block ahead of time as
4286                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4287                  * may want it later.
4288                  */
4289                 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
4290                 if (ret) {
4291                         mlog_exit(ret);
4292                         goto out;
4293                 }
4294                 eb = (struct ocfs2_extent_block *) bh->b_data;
4295                 el = &eb->h_list;
4296         }
4297
4298         /*
4299          * Unless we have a contiguous insert, we'll need to know if
4300          * there is room left in our allocation tree for another
4301          * extent record.
4302          *
4303          * XXX: This test is simplistic, we can search for empty
4304          * extent records too.
4305          */
4306         *free_records = le16_to_cpu(el->l_count) -
4307                 le16_to_cpu(el->l_next_free_rec);
4308
4309         if (!insert->ins_tree_depth) {
4310                 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4311                 ocfs2_figure_appending_type(insert, el, insert_rec);
4312                 return 0;
4313         }
4314
4315         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4316         if (!path) {
4317                 ret = -ENOMEM;
4318                 mlog_errno(ret);
4319                 goto out;
4320         }
4321
4322         /*
4323          * In the case that we're inserting past what the tree
4324          * currently accounts for, ocfs2_find_path() will return for
4325          * us the rightmost tree path. This is accounted for below in
4326          * the appending code.
4327          */
4328         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4329         if (ret) {
4330                 mlog_errno(ret);
4331                 goto out;
4332         }
4333
4334         el = path_leaf_el(path);
4335
4336         /*
4337          * Now that we have the path, there's two things we want to determine:
4338          * 1) Contiguousness (also set contig_index if this is so)
4339          *
4340          * 2) Are we doing an append? We can trivially break this up
4341          *     into two types of appends: simple record append, or a
4342          *     rotate inside the tail leaf.
4343          */
4344         ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4345
4346         /*
4347          * The insert code isn't quite ready to deal with all cases of
4348          * left contiguousness. Specifically, if it's an insert into
4349          * the 1st record in a leaf, it will require the adjustment of
4350          * cluster count on the last record of the path directly to it's
4351          * left. For now, just catch that case and fool the layers
4352          * above us. This works just fine for tree_depth == 0, which
4353          * is why we allow that above.
4354          */
4355         if (insert->ins_contig == CONTIG_LEFT &&
4356             insert->ins_contig_index == 0)
4357                 insert->ins_contig = CONTIG_NONE;
4358
4359         /*
4360          * Ok, so we can simply compare against last_eb to figure out
4361          * whether the path doesn't exist. This will only happen in
4362          * the case that we're doing a tail append, so maybe we can
4363          * take advantage of that information somehow.
4364          */
4365         if (ocfs2_et_get_last_eb_blk(et) ==
4366             path_leaf_bh(path)->b_blocknr) {
4367                 /*
4368                  * Ok, ocfs2_find_path() returned us the rightmost
4369                  * tree path. This might be an appending insert. There are
4370                  * two cases:
4371                  *    1) We're doing a true append at the tail:
4372                  *      -This might even be off the end of the leaf
4373                  *    2) We're "appending" by rotating in the tail
4374                  */
4375                 ocfs2_figure_appending_type(insert, el, insert_rec);
4376         }
4377
4378 out:
4379         ocfs2_free_path(path);
4380
4381         if (ret == 0)
4382                 *last_eb_bh = bh;
4383         else
4384                 brelse(bh);
4385         return ret;
4386 }
4387
4388 /*
4389  * Insert an extent into an inode btree.
4390  *
4391  * The caller needs to update fe->i_clusters
4392  */
4393 int ocfs2_insert_extent(struct ocfs2_super *osb,
4394                         handle_t *handle,
4395                         struct inode *inode,
4396                         struct ocfs2_extent_tree *et,
4397                         u32 cpos,
4398                         u64 start_blk,
4399                         u32 new_clusters,
4400                         u8 flags,
4401                         struct ocfs2_alloc_context *meta_ac)
4402 {
4403         int status;
4404         int uninitialized_var(free_records);
4405         struct buffer_head *last_eb_bh = NULL;
4406         struct ocfs2_insert_type insert = {0, };
4407         struct ocfs2_extent_rec rec;
4408
4409         mlog(0, "add %u clusters at position %u to inode %llu\n",
4410              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4411
4412         memset(&rec, 0, sizeof(rec));
4413         rec.e_cpos = cpu_to_le32(cpos);
4414         rec.e_blkno = cpu_to_le64(start_blk);
4415         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4416         rec.e_flags = flags;
4417         status = ocfs2_et_insert_check(inode, et, &rec);
4418         if (status) {
4419                 mlog_errno(status);
4420                 goto bail;
4421         }
4422
4423         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4424                                           &free_records, &insert);
4425         if (status < 0) {
4426                 mlog_errno(status);
4427                 goto bail;
4428         }
4429
4430         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4431              "Insert.contig_index: %d, Insert.free_records: %d, "
4432              "Insert.tree_depth: %d\n",
4433              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4434              free_records, insert.ins_tree_depth);
4435
4436         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4437                 status = ocfs2_grow_tree(inode, handle, et,
4438                                          &insert.ins_tree_depth, &last_eb_bh,
4439                                          meta_ac);
4440                 if (status) {
4441                         mlog_errno(status);
4442                         goto bail;
4443                 }
4444         }
4445
4446         /* Finally, we can add clusters. This might rotate the tree for us. */
4447         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4448         if (status < 0)
4449                 mlog_errno(status);
4450         else if (et->et_ops == &ocfs2_dinode_et_ops)
4451                 ocfs2_extent_map_insert_rec(inode, &rec);
4452
4453 bail:
4454         brelse(last_eb_bh);
4455
4456         mlog_exit(status);
4457         return status;
4458 }
4459
4460 /*
4461  * Allcate and add clusters into the extent b-tree.
4462  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4463  * The extent b-tree's root is specified by et, and
4464  * it is not limited to the file storage. Any extent tree can use this
4465  * function if it implements the proper ocfs2_extent_tree.
4466  */
4467 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4468                                 struct inode *inode,
4469                                 u32 *logical_offset,
4470                                 u32 clusters_to_add,
4471                                 int mark_unwritten,
4472                                 struct ocfs2_extent_tree *et,
4473                                 handle_t *handle,
4474                                 struct ocfs2_alloc_context *data_ac,
4475                                 struct ocfs2_alloc_context *meta_ac,
4476                                 enum ocfs2_alloc_restarted *reason_ret)
4477 {
4478         int status = 0;
4479         int free_extents;
4480         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4481         u32 bit_off, num_bits;
4482         u64 block;
4483         u8 flags = 0;
4484
4485         BUG_ON(!clusters_to_add);
4486
4487         if (mark_unwritten)
4488                 flags = OCFS2_EXT_UNWRITTEN;
4489
4490         free_extents = ocfs2_num_free_extents(osb, inode, et);
4491         if (free_extents < 0) {
4492                 status = free_extents;
4493                 mlog_errno(status);
4494                 goto leave;
4495         }
4496
4497         /* there are two cases which could cause us to EAGAIN in the
4498          * we-need-more-metadata case:
4499          * 1) we haven't reserved *any*
4500          * 2) we are so fragmented, we've needed to add metadata too
4501          *    many times. */
4502         if (!free_extents && !meta_ac) {
4503                 mlog(0, "we haven't reserved any metadata!\n");
4504                 status = -EAGAIN;
4505                 reason = RESTART_META;
4506                 goto leave;
4507         } else if ((!free_extents)
4508                    && (ocfs2_alloc_context_bits_left(meta_ac)
4509                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4510                 mlog(0, "filesystem is really fragmented...\n");
4511                 status = -EAGAIN;
4512                 reason = RESTART_META;
4513                 goto leave;
4514         }
4515
4516         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4517                                         clusters_to_add, &bit_off, &num_bits);
4518         if (status < 0) {
4519                 if (status != -ENOSPC)
4520                         mlog_errno(status);
4521                 goto leave;
4522         }
4523
4524         BUG_ON(num_bits > clusters_to_add);
4525
4526         /* reserve our write early -- insert_extent may update the inode */
4527         status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4528                                       OCFS2_JOURNAL_ACCESS_WRITE);
4529         if (status < 0) {
4530                 mlog_errno(status);
4531                 goto leave;
4532         }
4533
4534         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4535         mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4536              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4537         status = ocfs2_insert_extent(osb, handle, inode, et,
4538                                      *logical_offset, block,
4539                                      num_bits, flags, meta_ac);
4540         if (status < 0) {
4541                 mlog_errno(status);
4542                 goto leave;
4543         }
4544
4545         status = ocfs2_journal_dirty(handle, et->et_root_bh);
4546         if (status < 0) {
4547                 mlog_errno(status);
4548                 goto leave;
4549         }
4550
4551         clusters_to_add -= num_bits;
4552         *logical_offset += num_bits;
4553
4554         if (clusters_to_add) {
4555                 mlog(0, "need to alloc once more, wanted = %u\n",
4556                      clusters_to_add);
4557                 status = -EAGAIN;
4558                 reason = RESTART_TRANS;
4559         }
4560
4561 leave:
4562         mlog_exit(status);
4563         if (reason_ret)
4564                 *reason_ret = reason;
4565         return status;
4566 }
4567
4568 static void ocfs2_make_right_split_rec(struct super_block *sb,
4569                                        struct ocfs2_extent_rec *split_rec,
4570                                        u32 cpos,
4571                                        struct ocfs2_extent_rec *rec)
4572 {
4573         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4574         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4575
4576         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4577
4578         split_rec->e_cpos = cpu_to_le32(cpos);
4579         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4580
4581         split_rec->e_blkno = rec->e_blkno;
4582         le64_add_cpu(&split_rec->e_blkno,
4583                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4584
4585         split_rec->e_flags = rec->e_flags;
4586 }
4587
4588 static int ocfs2_split_and_insert(struct inode *inode,
4589                                   handle_t *handle,
4590                                   struct ocfs2_path *path,
4591                                   struct ocfs2_extent_tree *et,
4592                                   struct buffer_head **last_eb_bh,
4593                                   int split_index,
4594                                   struct ocfs2_extent_rec *orig_split_rec,
4595                                   struct ocfs2_alloc_context *meta_ac)
4596 {
4597         int ret = 0, depth;
4598         unsigned int insert_range, rec_range, do_leftright = 0;
4599         struct ocfs2_extent_rec tmprec;
4600         struct ocfs2_extent_list *rightmost_el;
4601         struct ocfs2_extent_rec rec;
4602         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4603         struct ocfs2_insert_type insert;
4604         struct ocfs2_extent_block *eb;
4605
4606 leftright:
4607         /*
4608          * Store a copy of the record on the stack - it might move
4609          * around as the tree is manipulated below.
4610          */
4611         rec = path_leaf_el(path)->l_recs[split_index];
4612
4613         rightmost_el = et->et_root_el;
4614
4615         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4616         if (depth) {
4617                 BUG_ON(!(*last_eb_bh));
4618                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4619                 rightmost_el = &eb->h_list;
4620         }
4621
4622         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4623             le16_to_cpu(rightmost_el->l_count)) {
4624                 ret = ocfs2_grow_tree(inode, handle, et,
4625                                       &depth, last_eb_bh, meta_ac);
4626                 if (ret) {
4627                         mlog_errno(ret);
4628                         goto out;
4629                 }
4630         }
4631
4632         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4633         insert.ins_appending = APPEND_NONE;
4634         insert.ins_contig = CONTIG_NONE;
4635         insert.ins_tree_depth = depth;
4636
4637         insert_range = le32_to_cpu(split_rec.e_cpos) +
4638                 le16_to_cpu(split_rec.e_leaf_clusters);
4639         rec_range = le32_to_cpu(rec.e_cpos) +
4640                 le16_to_cpu(rec.e_leaf_clusters);
4641
4642         if (split_rec.e_cpos == rec.e_cpos) {
4643                 insert.ins_split = SPLIT_LEFT;
4644         } else if (insert_range == rec_range) {
4645                 insert.ins_split = SPLIT_RIGHT;
4646         } else {
4647                 /*
4648                  * Left/right split. We fake this as a right split
4649                  * first and then make a second pass as a left split.
4650                  */
4651                 insert.ins_split = SPLIT_RIGHT;
4652
4653                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4654                                            &rec);
4655
4656                 split_rec = tmprec;
4657
4658                 BUG_ON(do_leftright);
4659                 do_leftright = 1;
4660         }
4661
4662         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4663         if (ret) {
4664                 mlog_errno(ret);
4665                 goto out;
4666         }
4667
4668         if (do_leftright == 1) {
4669                 u32 cpos;
4670                 struct ocfs2_extent_list *el;
4671
4672                 do_leftright++;
4673                 split_rec = *orig_split_rec;
4674
4675                 ocfs2_reinit_path(path, 1);
4676
4677                 cpos = le32_to_cpu(split_rec.e_cpos);
4678                 ret = ocfs2_find_path(inode, path, cpos);
4679                 if (ret) {
4680                         mlog_errno(ret);
4681                         goto out;
4682                 }
4683
4684                 el = path_leaf_el(path);
4685                 split_index = ocfs2_search_extent_list(el, cpos);
4686                 goto leftright;
4687         }
4688 out:
4689
4690         return ret;
4691 }
4692
4693 /*
4694  * Mark part or all of the extent record at split_index in the leaf
4695  * pointed to by path as written. This removes the unwritten
4696  * extent flag.
4697  *
4698  * Care is taken to handle contiguousness so as to not grow the tree.
4699  *
4700  * meta_ac is not strictly necessary - we only truly need it if growth
4701  * of the tree is required. All other cases will degrade into a less
4702  * optimal tree layout.
4703  *
4704  * last_eb_bh should be the rightmost leaf block for any extent
4705  * btree. Since a split may grow the tree or a merge might shrink it,
4706  * the caller cannot trust the contents of that buffer after this call.
4707  *
4708  * This code is optimized for readability - several passes might be
4709  * made over certain portions of the tree. All of those blocks will
4710  * have been brought into cache (and pinned via the journal), so the
4711  * extra overhead is not expressed in terms of disk reads.
4712  */
4713 static int __ocfs2_mark_extent_written(struct inode *inode,
4714                                        struct ocfs2_extent_tree *et,
4715                                        handle_t *handle,
4716                                        struct ocfs2_path *path,
4717                                        int split_index,
4718                                        struct ocfs2_extent_rec *split_rec,
4719                                        struct ocfs2_alloc_context *meta_ac,
4720                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4721 {
4722         int ret = 0;
4723         struct ocfs2_extent_list *el = path_leaf_el(path);
4724         struct buffer_head *last_eb_bh = NULL;
4725         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4726         struct ocfs2_merge_ctxt ctxt;
4727         struct ocfs2_extent_list *rightmost_el;
4728
4729         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4730                 ret = -EIO;
4731                 mlog_errno(ret);
4732                 goto out;
4733         }
4734
4735         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4736             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4737              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4738                 ret = -EIO;
4739                 mlog_errno(ret);
4740                 goto out;
4741         }
4742
4743         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4744                                                             split_index,
4745                                                             split_rec);
4746
4747         /*
4748          * The core merge / split code wants to know how much room is
4749          * left in this inodes allocation tree, so we pass the
4750          * rightmost extent list.
4751          */
4752         if (path->p_tree_depth) {
4753                 struct ocfs2_extent_block *eb;
4754
4755                 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4756                                        &last_eb_bh);
4757                 if (ret) {
4758                         mlog_exit(ret);
4759                         goto out;
4760                 }
4761
4762                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4763                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4764                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4765                         ret = -EROFS;
4766                         goto out;
4767                 }
4768
4769                 rightmost_el = &eb->h_list;
4770         } else
4771                 rightmost_el = path_root_el(path);
4772
4773         if (rec->e_cpos == split_rec->e_cpos &&
4774             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4775                 ctxt.c_split_covers_rec = 1;
4776         else
4777                 ctxt.c_split_covers_rec = 0;
4778
4779         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4780
4781         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4782              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4783              ctxt.c_split_covers_rec);
4784
4785         if (ctxt.c_contig_type == CONTIG_NONE) {
4786                 if (ctxt.c_split_covers_rec)
4787                         el->l_recs[split_index] = *split_rec;
4788                 else
4789                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4790                                                      &last_eb_bh, split_index,
4791                                                      split_rec, meta_ac);
4792                 if (ret)
4793                         mlog_errno(ret);
4794         } else {
4795                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4796                                                 split_index, split_rec,
4797                                                 dealloc, &ctxt, et);
4798                 if (ret)
4799                         mlog_errno(ret);
4800         }
4801
4802 out:
4803         brelse(last_eb_bh);
4804         return ret;
4805 }
4806
4807 /*
4808  * Mark the already-existing extent at cpos as written for len clusters.
4809  *
4810  * If the existing extent is larger than the request, initiate a
4811  * split. An attempt will be made at merging with adjacent extents.
4812  *
4813  * The caller is responsible for passing down meta_ac if we'll need it.
4814  */
4815 int ocfs2_mark_extent_written(struct inode *inode,
4816                               struct ocfs2_extent_tree *et,
4817                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4818                               struct ocfs2_alloc_context *meta_ac,
4819                               struct ocfs2_cached_dealloc_ctxt *dealloc)
4820 {
4821         int ret, index;
4822         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4823         struct ocfs2_extent_rec split_rec;
4824         struct ocfs2_path *left_path = NULL;
4825         struct ocfs2_extent_list *el;
4826
4827         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4828              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4829
4830         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4831                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4832                             "that are being written to, but the feature bit "
4833                             "is not set in the super block.",
4834                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4835                 ret = -EROFS;
4836                 goto out;
4837         }
4838
4839         /*
4840          * XXX: This should be fixed up so that we just re-insert the
4841          * next extent records.
4842          *
4843          * XXX: This is a hack on the extent tree, maybe it should be
4844          * an op?
4845          */
4846         if (et->et_ops == &ocfs2_dinode_et_ops)
4847                 ocfs2_extent_map_trunc(inode, 0);
4848
4849         left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4850         if (!left_path) {
4851                 ret = -ENOMEM;
4852                 mlog_errno(ret);
4853                 goto out;
4854         }
4855
4856         ret = ocfs2_find_path(inode, left_path, cpos);
4857         if (ret) {
4858                 mlog_errno(ret);
4859                 goto out;
4860         }
4861         el = path_leaf_el(left_path);
4862
4863         index = ocfs2_search_extent_list(el, cpos);
4864         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4865                 ocfs2_error(inode->i_sb,
4866                             "Inode %llu has an extent at cpos %u which can no "
4867                             "longer be found.\n",
4868                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4869                 ret = -EROFS;
4870                 goto out;
4871         }
4872
4873         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4874         split_rec.e_cpos = cpu_to_le32(cpos);
4875         split_rec.e_leaf_clusters = cpu_to_le16(len);
4876         split_rec.e_blkno = cpu_to_le64(start_blkno);
4877         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4878         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4879
4880         ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4881                                           index, &split_rec, meta_ac,
4882                                           dealloc);
4883         if (ret)
4884                 mlog_errno(ret);
4885
4886 out:
4887         ocfs2_free_path(left_path);
4888         return ret;
4889 }
4890
4891 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4892                             handle_t *handle, struct ocfs2_path *path,
4893                             int index, u32 new_range,
4894                             struct ocfs2_alloc_context *meta_ac)
4895 {
4896         int ret, depth, credits = handle->h_buffer_credits;
4897         struct buffer_head *last_eb_bh = NULL;
4898         struct ocfs2_extent_block *eb;
4899         struct ocfs2_extent_list *rightmost_el, *el;
4900         struct ocfs2_extent_rec split_rec;
4901         struct ocfs2_extent_rec *rec;
4902         struct ocfs2_insert_type insert;
4903
4904         /*
4905          * Setup the record to split before we grow the tree.
4906          */
4907         el = path_leaf_el(path);
4908         rec = &el->l_recs[index];
4909         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4910
4911         depth = path->p_tree_depth;
4912         if (depth > 0) {
4913                 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4914                                        &last_eb_bh);
4915                 if (ret < 0) {
4916                         mlog_errno(ret);
4917                         goto out;
4918                 }
4919
4920                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4921                 rightmost_el = &eb->h_list;
4922         } else
4923                 rightmost_el = path_leaf_el(path);
4924
4925         credits += path->p_tree_depth +
4926                    ocfs2_extend_meta_needed(et->et_root_el);
4927         ret = ocfs2_extend_trans(handle, credits);
4928         if (ret) {
4929                 mlog_errno(ret);
4930                 goto out;
4931         }
4932
4933         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4934             le16_to_cpu(rightmost_el->l_count)) {
4935                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4936                                       meta_ac);
4937                 if (ret) {
4938                         mlog_errno(ret);
4939                         goto out;
4940                 }
4941         }
4942
4943         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4944         insert.ins_appending = APPEND_NONE;
4945         insert.ins_contig = CONTIG_NONE;
4946         insert.ins_split = SPLIT_RIGHT;
4947         insert.ins_tree_depth = depth;
4948
4949         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4950         if (ret)
4951                 mlog_errno(ret);
4952
4953 out:
4954         brelse(last_eb_bh);
4955         return ret;
4956 }
4957
4958 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4959                               struct ocfs2_path *path, int index,
4960                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4961                               u32 cpos, u32 len,
4962                               struct ocfs2_extent_tree *et)
4963 {
4964         int ret;
4965         u32 left_cpos, rec_range, trunc_range;
4966         int wants_rotate = 0, is_rightmost_tree_rec = 0;
4967         struct super_block *sb = inode->i_sb;
4968         struct ocfs2_path *left_path = NULL;
4969         struct ocfs2_extent_list *el = path_leaf_el(path);
4970         struct ocfs2_extent_rec *rec;
4971         struct ocfs2_extent_block *eb;
4972
4973         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4974                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4975                 if (ret) {
4976                         mlog_errno(ret);
4977                         goto out;
4978                 }
4979
4980                 index--;
4981         }
4982
4983         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4984             path->p_tree_depth) {
4985                 /*
4986                  * Check whether this is the rightmost tree record. If
4987                  * we remove all of this record or part of its right
4988                  * edge then an update of the record lengths above it
4989                  * will be required.
4990                  */
4991                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4992                 if (eb->h_next_leaf_blk == 0)
4993                         is_rightmost_tree_rec = 1;
4994         }
4995
4996         rec = &el->l_recs[index];
4997         if (index == 0 && path->p_tree_depth &&
4998             le32_to_cpu(rec->e_cpos) == cpos) {
4999                 /*
5000                  * Changing the leftmost offset (via partial or whole
5001                  * record truncate) of an interior (or rightmost) path
5002                  * means we have to update the subtree that is formed
5003                  * by this leaf and the one to it's left.
5004                  *
5005                  * There are two cases we can skip:
5006                  *   1) Path is the leftmost one in our inode tree.
5007                  *   2) The leaf is rightmost and will be empty after
5008                  *      we remove the extent record - the rotate code
5009                  *      knows how to update the newly formed edge.
5010                  */
5011
5012                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5013                                                     &left_cpos);
5014                 if (ret) {
5015                         mlog_errno(ret);
5016                         goto out;
5017                 }
5018
5019                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5020                         left_path = ocfs2_new_path(path_root_bh(path),
5021                                                    path_root_el(path));
5022                         if (!left_path) {
5023                                 ret = -ENOMEM;
5024                                 mlog_errno(ret);
5025                                 goto out;
5026                         }
5027
5028                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5029                         if (ret) {
5030                                 mlog_errno(ret);
5031                                 goto out;
5032                         }
5033                 }
5034         }
5035
5036         ret = ocfs2_extend_rotate_transaction(handle, 0,
5037                                               handle->h_buffer_credits,
5038                                               path);
5039         if (ret) {
5040                 mlog_errno(ret);
5041                 goto out;
5042         }
5043
5044         ret = ocfs2_journal_access_path(inode, handle, path);
5045         if (ret) {
5046                 mlog_errno(ret);
5047                 goto out;
5048         }
5049
5050         ret = ocfs2_journal_access_path(inode, handle, left_path);
5051         if (ret) {
5052                 mlog_errno(ret);
5053                 goto out;
5054         }
5055
5056         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5057         trunc_range = cpos + len;
5058
5059         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5060                 int next_free;
5061
5062                 memset(rec, 0, sizeof(*rec));
5063                 ocfs2_cleanup_merge(el, index);
5064                 wants_rotate = 1;
5065
5066                 next_free = le16_to_cpu(el->l_next_free_rec);
5067                 if (is_rightmost_tree_rec && next_free > 1) {
5068                         /*
5069                          * We skip the edge update if this path will
5070                          * be deleted by the rotate code.
5071                          */
5072                         rec = &el->l_recs[next_free - 1];
5073                         ocfs2_adjust_rightmost_records(inode, handle, path,
5074                                                        rec);
5075                 }
5076         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5077                 /* Remove leftmost portion of the record. */
5078                 le32_add_cpu(&rec->e_cpos, len);
5079                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5080                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5081         } else if (rec_range == trunc_range) {
5082                 /* Remove rightmost portion of the record */
5083                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5084                 if (is_rightmost_tree_rec)
5085                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5086         } else {
5087                 /* Caller should have trapped this. */
5088                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5089                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5090                      le32_to_cpu(rec->e_cpos),
5091                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5092                 BUG();
5093         }
5094
5095         if (left_path) {
5096                 int subtree_index;
5097
5098                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5099                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5100                                            subtree_index);
5101         }
5102
5103         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5104
5105         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5106         if (ret) {
5107                 mlog_errno(ret);
5108                 goto out;
5109         }
5110
5111 out:
5112         ocfs2_free_path(left_path);
5113         return ret;
5114 }
5115
5116 int ocfs2_remove_extent(struct inode *inode,
5117                         struct ocfs2_extent_tree *et,
5118                         u32 cpos, u32 len, handle_t *handle,
5119                         struct ocfs2_alloc_context *meta_ac,
5120                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5121 {
5122         int ret, index;
5123         u32 rec_range, trunc_range;
5124         struct ocfs2_extent_rec *rec;
5125         struct ocfs2_extent_list *el;
5126         struct ocfs2_path *path = NULL;
5127
5128         ocfs2_extent_map_trunc(inode, 0);
5129
5130         path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
5131         if (!path) {
5132                 ret = -ENOMEM;
5133                 mlog_errno(ret);
5134                 goto out;
5135         }
5136
5137         ret = ocfs2_find_path(inode, path, cpos);
5138         if (ret) {
5139                 mlog_errno(ret);
5140                 goto out;
5141         }
5142
5143         el = path_leaf_el(path);
5144         index = ocfs2_search_extent_list(el, cpos);
5145         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5146                 ocfs2_error(inode->i_sb,
5147                             "Inode %llu has an extent at cpos %u which can no "
5148                             "longer be found.\n",
5149                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5150                 ret = -EROFS;
5151                 goto out;
5152         }
5153
5154         /*
5155          * We have 3 cases of extent removal:
5156          *   1) Range covers the entire extent rec
5157          *   2) Range begins or ends on one edge of the extent rec
5158          *   3) Range is in the middle of the extent rec (no shared edges)
5159          *
5160          * For case 1 we remove the extent rec and left rotate to
5161          * fill the hole.
5162          *
5163          * For case 2 we just shrink the existing extent rec, with a
5164          * tree update if the shrinking edge is also the edge of an
5165          * extent block.
5166          *
5167          * For case 3 we do a right split to turn the extent rec into
5168          * something case 2 can handle.
5169          */
5170         rec = &el->l_recs[index];
5171         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5172         trunc_range = cpos + len;
5173
5174         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5175
5176         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5177              "(cpos %u, len %u)\n",
5178              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5179              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5180
5181         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5182                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5183                                          cpos, len, et);
5184                 if (ret) {
5185                         mlog_errno(ret);
5186                         goto out;
5187                 }
5188         } else {
5189                 ret = ocfs2_split_tree(inode, et, handle, path, index,
5190                                        trunc_range, meta_ac);
5191                 if (ret) {
5192                         mlog_errno(ret);
5193                         goto out;
5194                 }
5195
5196                 /*
5197                  * The split could have manipulated the tree enough to
5198                  * move the record location, so we have to look for it again.
5199                  */
5200                 ocfs2_reinit_path(path, 1);
5201
5202                 ret = ocfs2_find_path(inode, path, cpos);
5203                 if (ret) {
5204                         mlog_errno(ret);
5205                         goto out;
5206                 }
5207
5208                 el = path_leaf_el(path);
5209                 index = ocfs2_search_extent_list(el, cpos);
5210                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5211                         ocfs2_error(inode->i_sb,
5212                                     "Inode %llu: split at cpos %u lost record.",
5213                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5214                                     cpos);
5215                         ret = -EROFS;
5216                         goto out;
5217                 }
5218
5219                 /*
5220                  * Double check our values here. If anything is fishy,
5221                  * it's easier to catch it at the top level.
5222                  */
5223                 rec = &el->l_recs[index];
5224                 rec_range = le32_to_cpu(rec->e_cpos) +
5225                         ocfs2_rec_clusters(el, rec);
5226                 if (rec_range != trunc_range) {
5227                         ocfs2_error(inode->i_sb,
5228                                     "Inode %llu: error after split at cpos %u"
5229                                     "trunc len %u, existing record is (%u,%u)",
5230                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5231                                     cpos, len, le32_to_cpu(rec->e_cpos),
5232                                     ocfs2_rec_clusters(el, rec));
5233                         ret = -EROFS;
5234                         goto out;
5235                 }
5236
5237                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5238                                          cpos, len, et);
5239                 if (ret) {
5240                         mlog_errno(ret);
5241                         goto out;
5242                 }
5243         }
5244
5245 out:
5246         ocfs2_free_path(path);
5247         return ret;
5248 }
5249
5250 int ocfs2_remove_btree_range(struct inode *inode,
5251                              struct ocfs2_extent_tree *et,
5252                              u32 cpos, u32 phys_cpos, u32 len,
5253                              struct ocfs2_cached_dealloc_ctxt *dealloc)
5254 {
5255         int ret;
5256         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5257         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5258         struct inode *tl_inode = osb->osb_tl_inode;
5259         handle_t *handle;
5260         struct ocfs2_alloc_context *meta_ac = NULL;
5261
5262         ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5263         if (ret) {
5264                 mlog_errno(ret);
5265                 return ret;
5266         }
5267
5268         mutex_lock(&tl_inode->i_mutex);
5269
5270         if (ocfs2_truncate_log_needs_flush(osb)) {
5271                 ret = __ocfs2_flush_truncate_log(osb);
5272                 if (ret < 0) {
5273                         mlog_errno(ret);
5274                         goto out;
5275                 }
5276         }
5277
5278         handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
5279         if (IS_ERR(handle)) {
5280                 ret = PTR_ERR(handle);
5281                 mlog_errno(ret);
5282                 goto out;
5283         }
5284
5285         ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
5286                                    OCFS2_JOURNAL_ACCESS_WRITE);
5287         if (ret) {
5288                 mlog_errno(ret);
5289                 goto out;
5290         }
5291
5292         ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5293                                   dealloc);
5294         if (ret) {
5295                 mlog_errno(ret);
5296                 goto out_commit;
5297         }
5298
5299         ocfs2_et_update_clusters(inode, et, -len);
5300
5301         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5302         if (ret) {
5303                 mlog_errno(ret);
5304                 goto out_commit;
5305         }
5306
5307         ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5308         if (ret)
5309                 mlog_errno(ret);
5310
5311 out_commit:
5312         ocfs2_commit_trans(osb, handle);
5313 out:
5314         mutex_unlock(&tl_inode->i_mutex);
5315
5316         if (meta_ac)
5317                 ocfs2_free_alloc_context(meta_ac);
5318
5319         return ret;
5320 }
5321
5322 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5323 {
5324         struct buffer_head *tl_bh = osb->osb_tl_bh;
5325         struct ocfs2_dinode *di;
5326         struct ocfs2_truncate_log *tl;
5327
5328         di = (struct ocfs2_dinode *) tl_bh->b_data;
5329         tl = &di->id2.i_dealloc;
5330
5331         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5332                         "slot %d, invalid truncate log parameters: used = "
5333                         "%u, count = %u\n", osb->slot_num,
5334                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5335         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5336 }
5337
5338 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5339                                            unsigned int new_start)
5340 {
5341         unsigned int tail_index;
5342         unsigned int current_tail;
5343
5344         /* No records, nothing to coalesce */
5345         if (!le16_to_cpu(tl->tl_used))
5346                 return 0;
5347
5348         tail_index = le16_to_cpu(tl->tl_used) - 1;
5349         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5350         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5351
5352         return current_tail == new_start;
5353 }
5354
5355 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5356                               handle_t *handle,
5357                               u64 start_blk,
5358                               unsigned int num_clusters)
5359 {
5360         int status, index;
5361         unsigned int start_cluster, tl_count;
5362         struct inode *tl_inode = osb->osb_tl_inode;
5363         struct buffer_head *tl_bh = osb->osb_tl_bh;
5364         struct ocfs2_dinode *di;
5365         struct ocfs2_truncate_log *tl;
5366
5367         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5368                    (unsigned long long)start_blk, num_clusters);
5369
5370         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5371
5372         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5373
5374         di = (struct ocfs2_dinode *) tl_bh->b_data;
5375
5376         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5377          * by the underlying call to ocfs2_read_inode_block(), so any
5378          * corruption is a code bug */
5379         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5380
5381         tl = &di->id2.i_dealloc;
5382         tl_count = le16_to_cpu(tl->tl_count);
5383         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5384                         tl_count == 0,
5385                         "Truncate record count on #%llu invalid "
5386                         "wanted %u, actual %u\n",
5387                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5388                         ocfs2_truncate_recs_per_inode(osb->sb),
5389                         le16_to_cpu(tl->tl_count));
5390
5391         /* Caller should have known to flush before calling us. */
5392         index = le16_to_cpu(tl->tl_used);
5393         if (index >= tl_count) {
5394                 status = -ENOSPC;
5395                 mlog_errno(status);
5396                 goto bail;
5397         }
5398
5399         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5400                                       OCFS2_JOURNAL_ACCESS_WRITE);
5401         if (status < 0) {
5402                 mlog_errno(status);
5403                 goto bail;
5404         }
5405
5406         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5407              "%llu (index = %d)\n", num_clusters, start_cluster,
5408              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5409
5410         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5411                 /*
5412                  * Move index back to the record we are coalescing with.
5413                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5414                  */
5415                 index--;
5416
5417                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5418                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5419                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5420                      num_clusters);
5421         } else {
5422                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5423                 tl->tl_used = cpu_to_le16(index + 1);
5424         }
5425         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5426
5427         status = ocfs2_journal_dirty(handle, tl_bh);
5428         if (status < 0) {
5429                 mlog_errno(status);
5430                 goto bail;
5431         }
5432
5433 bail:
5434         mlog_exit(status);
5435         return status;
5436 }
5437
5438 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5439                                          handle_t *handle,
5440                                          struct inode *data_alloc_inode,
5441                                          struct buffer_head *data_alloc_bh)
5442 {
5443         int status = 0;
5444         int i;
5445         unsigned int num_clusters;
5446         u64 start_blk;
5447         struct ocfs2_truncate_rec rec;
5448         struct ocfs2_dinode *di;
5449         struct ocfs2_truncate_log *tl;
5450         struct inode *tl_inode = osb->osb_tl_inode;
5451         struct buffer_head *tl_bh = osb->osb_tl_bh;
5452
5453         mlog_entry_void();
5454
5455         di = (struct ocfs2_dinode *) tl_bh->b_data;
5456         tl = &di->id2.i_dealloc;
5457         i = le16_to_cpu(tl->tl_used) - 1;
5458         while (i >= 0) {
5459                 /* Caller has given us at least enough credits to
5460                  * update the truncate log dinode */
5461                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5462                                               OCFS2_JOURNAL_ACCESS_WRITE);
5463                 if (status < 0) {
5464                         mlog_errno(status);
5465                         goto bail;
5466                 }
5467
5468                 tl->tl_used = cpu_to_le16(i);
5469
5470                 status = ocfs2_journal_dirty(handle, tl_bh);
5471                 if (status < 0) {
5472                         mlog_errno(status);
5473                         goto bail;
5474                 }
5475
5476                 /* TODO: Perhaps we can calculate the bulk of the
5477                  * credits up front rather than extending like
5478                  * this. */
5479                 status = ocfs2_extend_trans(handle,
5480                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5481                 if (status < 0) {
5482                         mlog_errno(status);
5483                         goto bail;
5484                 }
5485
5486                 rec = tl->tl_recs[i];
5487                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5488                                                     le32_to_cpu(rec.t_start));
5489                 num_clusters = le32_to_cpu(rec.t_clusters);
5490
5491                 /* if start_blk is not set, we ignore the record as
5492                  * invalid. */
5493                 if (start_blk) {
5494                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5495                              i, le32_to_cpu(rec.t_start), num_clusters);
5496
5497                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5498                                                      data_alloc_bh, start_blk,
5499                                                      num_clusters);
5500                         if (status < 0) {
5501                                 mlog_errno(status);
5502                                 goto bail;
5503                         }
5504                 }
5505                 i--;
5506         }
5507
5508 bail:
5509         mlog_exit(status);
5510         return status;
5511 }
5512
5513 /* Expects you to already be holding tl_inode->i_mutex */
5514 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5515 {
5516         int status;
5517         unsigned int num_to_flush;
5518         handle_t *handle;
5519         struct inode *tl_inode = osb->osb_tl_inode;
5520         struct inode *data_alloc_inode = NULL;
5521         struct buffer_head *tl_bh = osb->osb_tl_bh;
5522         struct buffer_head *data_alloc_bh = NULL;
5523         struct ocfs2_dinode *di;
5524         struct ocfs2_truncate_log *tl;
5525
5526         mlog_entry_void();
5527
5528         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5529
5530         di = (struct ocfs2_dinode *) tl_bh->b_data;
5531
5532         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5533          * by the underlying call to ocfs2_read_inode_block(), so any
5534          * corruption is a code bug */
5535         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5536
5537         tl = &di->id2.i_dealloc;
5538         num_to_flush = le16_to_cpu(tl->tl_used);
5539         mlog(0, "Flush %u records from truncate log #%llu\n",
5540              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5541         if (!num_to_flush) {
5542                 status = 0;
5543                 goto out;
5544         }
5545
5546         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5547                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5548                                                        OCFS2_INVALID_SLOT);
5549         if (!data_alloc_inode) {
5550                 status = -EINVAL;
5551                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5552                 goto out;
5553         }
5554
5555         mutex_lock(&data_alloc_inode->i_mutex);
5556
5557         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5558         if (status < 0) {
5559                 mlog_errno(status);
5560                 goto out_mutex;
5561         }
5562
5563         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5564         if (IS_ERR(handle)) {
5565                 status = PTR_ERR(handle);
5566                 mlog_errno(status);
5567                 goto out_unlock;
5568         }
5569
5570         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5571                                                data_alloc_bh);
5572         if (status < 0)
5573                 mlog_errno(status);
5574
5575         ocfs2_commit_trans(osb, handle);
5576
5577 out_unlock:
5578         brelse(data_alloc_bh);
5579         ocfs2_inode_unlock(data_alloc_inode, 1);
5580
5581 out_mutex:
5582         mutex_unlock(&data_alloc_inode->i_mutex);
5583         iput(data_alloc_inode);
5584
5585 out:
5586         mlog_exit(status);
5587         return status;
5588 }
5589
5590 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5591 {
5592         int status;
5593         struct inode *tl_inode = osb->osb_tl_inode;
5594
5595         mutex_lock(&tl_inode->i_mutex);
5596         status = __ocfs2_flush_truncate_log(osb);
5597         mutex_unlock(&tl_inode->i_mutex);
5598
5599         return status;
5600 }
5601
5602 static void ocfs2_truncate_log_worker(struct work_struct *work)
5603 {
5604         int status;
5605         struct ocfs2_super *osb =
5606                 container_of(work, struct ocfs2_super,
5607                              osb_truncate_log_wq.work);
5608
5609         mlog_entry_void();
5610
5611         status = ocfs2_flush_truncate_log(osb);
5612         if (status < 0)
5613                 mlog_errno(status);
5614         else
5615                 ocfs2_init_inode_steal_slot(osb);
5616
5617         mlog_exit(status);
5618 }
5619
5620 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5621 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5622                                        int cancel)
5623 {
5624         if (osb->osb_tl_inode) {
5625                 /* We want to push off log flushes while truncates are
5626                  * still running. */
5627                 if (cancel)
5628                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5629
5630                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5631                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5632         }
5633 }
5634
5635 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5636                                        int slot_num,
5637                                        struct inode **tl_inode,
5638                                        struct buffer_head **tl_bh)
5639 {
5640         int status;
5641         struct inode *inode = NULL;
5642         struct buffer_head *bh = NULL;
5643
5644         inode = ocfs2_get_system_file_inode(osb,
5645                                            TRUNCATE_LOG_SYSTEM_INODE,
5646                                            slot_num);
5647         if (!inode) {
5648                 status = -EINVAL;
5649                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5650                 goto bail;
5651         }
5652
5653         status = ocfs2_read_inode_block(inode, &bh);
5654         if (status < 0) {
5655                 iput(inode);
5656                 mlog_errno(status);
5657                 goto bail;
5658         }
5659
5660         *tl_inode = inode;
5661         *tl_bh    = bh;
5662 bail:
5663         mlog_exit(status);
5664         return status;
5665 }
5666
5667 /* called during the 1st stage of node recovery. we stamp a clean
5668  * truncate log and pass back a copy for processing later. if the
5669  * truncate log does not require processing, a *tl_copy is set to
5670  * NULL. */
5671 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5672                                       int slot_num,
5673                                       struct ocfs2_dinode **tl_copy)
5674 {
5675         int status;
5676         struct inode *tl_inode = NULL;
5677         struct buffer_head *tl_bh = NULL;
5678         struct ocfs2_dinode *di;
5679         struct ocfs2_truncate_log *tl;
5680
5681         *tl_copy = NULL;
5682
5683         mlog(0, "recover truncate log from slot %d\n", slot_num);
5684
5685         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5686         if (status < 0) {
5687                 mlog_errno(status);
5688                 goto bail;
5689         }
5690
5691         di = (struct ocfs2_dinode *) tl_bh->b_data;
5692
5693         /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
5694          * validated by the underlying call to ocfs2_read_inode_block(),
5695          * so any corruption is a code bug */
5696         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5697
5698         tl = &di->id2.i_dealloc;
5699         if (le16_to_cpu(tl->tl_used)) {
5700                 mlog(0, "We'll have %u logs to recover\n",
5701                      le16_to_cpu(tl->tl_used));
5702
5703                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5704                 if (!(*tl_copy)) {
5705                         status = -ENOMEM;
5706                         mlog_errno(status);
5707                         goto bail;
5708                 }
5709
5710                 /* Assuming the write-out below goes well, this copy
5711                  * will be passed back to recovery for processing. */
5712                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5713
5714                 /* All we need to do to clear the truncate log is set
5715                  * tl_used. */
5716                 tl->tl_used = 0;
5717
5718                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5719                 if (status < 0) {
5720                         mlog_errno(status);
5721                         goto bail;
5722                 }
5723         }
5724
5725 bail:
5726         if (tl_inode)
5727                 iput(tl_inode);
5728         brelse(tl_bh);
5729
5730         if (status < 0 && (*tl_copy)) {
5731                 kfree(*tl_copy);
5732                 *tl_copy = NULL;
5733         }
5734
5735         mlog_exit(status);
5736         return status;
5737 }
5738
5739 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5740                                          struct ocfs2_dinode *tl_copy)
5741 {
5742         int status = 0;
5743         int i;
5744         unsigned int clusters, num_recs, start_cluster;
5745         u64 start_blk;
5746         handle_t *handle;
5747         struct inode *tl_inode = osb->osb_tl_inode;
5748         struct ocfs2_truncate_log *tl;
5749
5750         mlog_entry_void();
5751
5752         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5753                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5754                 return -EINVAL;
5755         }
5756
5757         tl = &tl_copy->id2.i_dealloc;
5758         num_recs = le16_to_cpu(tl->tl_used);
5759         mlog(0, "cleanup %u records from %llu\n", num_recs,
5760              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5761
5762         mutex_lock(&tl_inode->i_mutex);
5763         for(i = 0; i < num_recs; i++) {
5764                 if (ocfs2_truncate_log_needs_flush(osb)) {
5765                         status = __ocfs2_flush_truncate_log(osb);
5766                         if (status < 0) {
5767                                 mlog_errno(status);
5768                                 goto bail_up;
5769                         }
5770                 }
5771
5772                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5773                 if (IS_ERR(handle)) {
5774                         status = PTR_ERR(handle);
5775                         mlog_errno(status);
5776                         goto bail_up;
5777                 }
5778
5779                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5780                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5781                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5782
5783                 status = ocfs2_truncate_log_append(osb, handle,
5784                                                    start_blk, clusters);
5785                 ocfs2_commit_trans(osb, handle);
5786                 if (status < 0) {
5787                         mlog_errno(status);
5788                         goto bail_up;
5789                 }
5790         }
5791
5792 bail_up:
5793         mutex_unlock(&tl_inode->i_mutex);
5794
5795         mlog_exit(status);
5796         return status;
5797 }
5798
5799 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5800 {
5801         int status;
5802         struct inode *tl_inode = osb->osb_tl_inode;
5803
5804         mlog_entry_void();
5805
5806         if (tl_inode) {
5807                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5808                 flush_workqueue(ocfs2_wq);
5809
5810                 status = ocfs2_flush_truncate_log(osb);
5811                 if (status < 0)
5812                         mlog_errno(status);
5813
5814                 brelse(osb->osb_tl_bh);
5815                 iput(osb->osb_tl_inode);
5816         }
5817
5818         mlog_exit_void();
5819 }
5820
5821 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5822 {
5823         int status;
5824         struct inode *tl_inode = NULL;
5825         struct buffer_head *tl_bh = NULL;
5826
5827         mlog_entry_void();
5828
5829         status = ocfs2_get_truncate_log_info(osb,
5830                                              osb->slot_num,
5831                                              &tl_inode,
5832                                              &tl_bh);
5833         if (status < 0)
5834                 mlog_errno(status);
5835
5836         /* ocfs2_truncate_log_shutdown keys on the existence of
5837          * osb->osb_tl_inode so we don't set any of the osb variables
5838          * until we're sure all is well. */
5839         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5840                           ocfs2_truncate_log_worker);
5841         osb->osb_tl_bh    = tl_bh;
5842         osb->osb_tl_inode = tl_inode;
5843
5844         mlog_exit(status);
5845         return status;
5846 }
5847
5848 /*
5849  * Delayed de-allocation of suballocator blocks.
5850  *
5851  * Some sets of block de-allocations might involve multiple suballocator inodes.
5852  *
5853  * The locking for this can get extremely complicated, especially when
5854  * the suballocator inodes to delete from aren't known until deep
5855  * within an unrelated codepath.
5856  *
5857  * ocfs2_extent_block structures are a good example of this - an inode
5858  * btree could have been grown by any number of nodes each allocating
5859  * out of their own suballoc inode.
5860  *
5861  * These structures allow the delay of block de-allocation until a
5862  * later time, when locking of multiple cluster inodes won't cause
5863  * deadlock.
5864  */
5865
5866 /*
5867  * Describe a single bit freed from a suballocator.  For the block
5868  * suballocators, it represents one block.  For the global cluster
5869  * allocator, it represents some clusters and free_bit indicates
5870  * clusters number.
5871  */
5872 struct ocfs2_cached_block_free {
5873         struct ocfs2_cached_block_free          *free_next;
5874         u64                                     free_blk;
5875         unsigned int                            free_bit;
5876 };
5877
5878 struct ocfs2_per_slot_free_list {
5879         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5880         int                                     f_inode_type;
5881         int                                     f_slot;
5882         struct ocfs2_cached_block_free          *f_first;
5883 };
5884
5885 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5886                                     int sysfile_type,
5887                                     int slot,
5888                                     struct ocfs2_cached_block_free *head)
5889 {
5890         int ret;
5891         u64 bg_blkno;
5892         handle_t *handle;
5893         struct inode *inode;
5894         struct buffer_head *di_bh = NULL;
5895         struct ocfs2_cached_block_free *tmp;
5896
5897         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5898         if (!inode) {
5899                 ret = -EINVAL;
5900                 mlog_errno(ret);
5901                 goto out;
5902         }
5903
5904         mutex_lock(&inode->i_mutex);
5905
5906         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5907         if (ret) {
5908                 mlog_errno(ret);
5909                 goto out_mutex;
5910         }
5911
5912         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5913         if (IS_ERR(handle)) {
5914                 ret = PTR_ERR(handle);
5915                 mlog_errno(ret);
5916                 goto out_unlock;
5917         }
5918
5919         while (head) {
5920                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5921                                                       head->free_bit);
5922                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5923                      head->free_bit, (unsigned long long)head->free_blk);
5924
5925                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5926                                                head->free_bit, bg_blkno, 1);
5927                 if (ret) {
5928                         mlog_errno(ret);
5929                         goto out_journal;
5930                 }
5931
5932                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5933                 if (ret) {
5934                         mlog_errno(ret);
5935                         goto out_journal;
5936                 }
5937
5938                 tmp = head;
5939                 head = head->free_next;
5940                 kfree(tmp);
5941         }
5942
5943 out_journal:
5944         ocfs2_commit_trans(osb, handle);
5945
5946 out_unlock:
5947         ocfs2_inode_unlock(inode, 1);
5948         brelse(di_bh);
5949 out_mutex:
5950         mutex_unlock(&inode->i_mutex);
5951         iput(inode);
5952 out:
5953         while(head) {
5954                 /* Premature exit may have left some dangling items. */
5955                 tmp = head;
5956                 head = head->free_next;
5957                 kfree(tmp);
5958         }
5959
5960         return ret;
5961 }
5962
5963 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5964                                 u64 blkno, unsigned int bit)
5965 {
5966         int ret = 0;
5967         struct ocfs2_cached_block_free *item;
5968
5969         item = kmalloc(sizeof(*item), GFP_NOFS);
5970         if (item == NULL) {
5971                 ret = -ENOMEM;
5972                 mlog_errno(ret);
5973                 return ret;
5974         }
5975
5976         mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
5977              bit, (unsigned long long)blkno);
5978
5979         item->free_blk = blkno;
5980         item->free_bit = bit;
5981         item->free_next = ctxt->c_global_allocator;
5982
5983         ctxt->c_global_allocator = item;
5984         return ret;
5985 }
5986
5987 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
5988                                       struct ocfs2_cached_block_free *head)
5989 {
5990         struct ocfs2_cached_block_free *tmp;
5991         struct inode *tl_inode = osb->osb_tl_inode;
5992         handle_t *handle;
5993         int ret = 0;
5994
5995         mutex_lock(&tl_inode->i_mutex);
5996
5997         while (head) {
5998                 if (ocfs2_truncate_log_needs_flush(osb)) {
5999                         ret = __ocfs2_flush_truncate_log(osb);
6000                         if (ret < 0) {
6001                                 mlog_errno(ret);
6002                                 break;
6003                         }
6004                 }
6005
6006                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6007                 if (IS_ERR(handle)) {
6008                         ret = PTR_ERR(handle);
6009                         mlog_errno(ret);
6010                         break;
6011                 }
6012
6013                 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6014                                                 head->free_bit);
6015
6016                 ocfs2_commit_trans(osb, handle);
6017                 tmp = head;
6018                 head = head->free_next;
6019                 kfree(tmp);
6020
6021                 if (ret < 0) {
6022                         mlog_errno(ret);
6023                         break;
6024                 }
6025         }
6026
6027         mutex_unlock(&tl_inode->i_mutex);
6028
6029         while (head) {
6030                 /* Premature exit may have left some dangling items. */
6031                 tmp = head;
6032                 head = head->free_next;
6033                 kfree(tmp);
6034         }
6035
6036         return ret;
6037 }
6038
6039 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6040                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6041 {
6042         int ret = 0, ret2;
6043         struct ocfs2_per_slot_free_list *fl;
6044
6045         if (!ctxt)
6046                 return 0;
6047
6048         while (ctxt->c_first_suballocator) {
6049                 fl = ctxt->c_first_suballocator;
6050
6051                 if (fl->f_first) {
6052                         mlog(0, "Free items: (type %u, slot %d)\n",
6053                              fl->f_inode_type, fl->f_slot);
6054                         ret2 = ocfs2_free_cached_blocks(osb,
6055                                                         fl->f_inode_type,
6056                                                         fl->f_slot,
6057                                                         fl->f_first);
6058                         if (ret2)
6059                                 mlog_errno(ret2);
6060                         if (!ret)
6061                                 ret = ret2;
6062                 }
6063
6064                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6065                 kfree(fl);
6066         }
6067
6068         if (ctxt->c_global_allocator) {
6069                 ret2 = ocfs2_free_cached_clusters(osb,
6070                                                   ctxt->c_global_allocator);
6071                 if (ret2)
6072                         mlog_errno(ret2);
6073                 if (!ret)
6074                         ret = ret2;
6075
6076                 ctxt->c_global_allocator = NULL;
6077         }
6078
6079         return ret;
6080 }
6081
6082 static struct ocfs2_per_slot_free_list *
6083 ocfs2_find_per_slot_free_list(int type,
6084                               int slot,
6085                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6086 {
6087         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6088
6089         while (fl) {
6090                 if (fl->f_inode_type == type && fl->f_slot == slot)
6091                         return fl;
6092
6093                 fl = fl->f_next_suballocator;
6094         }
6095
6096         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6097         if (fl) {
6098                 fl->f_inode_type = type;
6099                 fl->f_slot = slot;
6100                 fl->f_first = NULL;
6101                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6102
6103                 ctxt->c_first_suballocator = fl;
6104         }
6105         return fl;
6106 }
6107
6108 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6109                                      int type, int slot, u64 blkno,
6110                                      unsigned int bit)
6111 {
6112         int ret;
6113         struct ocfs2_per_slot_free_list *fl;
6114         struct ocfs2_cached_block_free *item;
6115
6116         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6117         if (fl == NULL) {
6118                 ret = -ENOMEM;
6119                 mlog_errno(ret);
6120                 goto out;
6121         }
6122
6123         item = kmalloc(sizeof(*item), GFP_NOFS);
6124         if (item == NULL) {
6125                 ret = -ENOMEM;
6126                 mlog_errno(ret);
6127                 goto out;
6128         }
6129
6130         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6131              type, slot, bit, (unsigned long long)blkno);
6132
6133         item->free_blk = blkno;
6134         item->free_bit = bit;
6135         item->free_next = fl->f_first;
6136
6137         fl->f_first = item;
6138
6139         ret = 0;
6140 out:
6141         return ret;
6142 }
6143
6144 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6145                                          struct ocfs2_extent_block *eb)
6146 {
6147         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6148                                          le16_to_cpu(eb->h_suballoc_slot),
6149                                          le64_to_cpu(eb->h_blkno),
6150                                          le16_to_cpu(eb->h_suballoc_bit));
6151 }
6152
6153 /* This function will figure out whether the currently last extent
6154  * block will be deleted, and if it will, what the new last extent
6155  * block will be so we can update his h_next_leaf_blk field, as well
6156  * as the dinodes i_last_eb_blk */
6157 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6158                                        unsigned int clusters_to_del,
6159                                        struct ocfs2_path *path,
6160                                        struct buffer_head **new_last_eb)
6161 {
6162         int next_free, ret = 0;
6163         u32 cpos;
6164         struct ocfs2_extent_rec *rec;
6165         struct ocfs2_extent_block *eb;
6166         struct ocfs2_extent_list *el;
6167         struct buffer_head *bh = NULL;
6168
6169         *new_last_eb = NULL;
6170
6171         /* we have no tree, so of course, no last_eb. */
6172         if (!path->p_tree_depth)
6173                 goto out;
6174
6175         /* trunc to zero special case - this makes tree_depth = 0
6176          * regardless of what it is.  */
6177         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6178                 goto out;
6179
6180         el = path_leaf_el(path);
6181         BUG_ON(!el->l_next_free_rec);
6182
6183         /*
6184          * Make sure that this extent list will actually be empty
6185          * after we clear away the data. We can shortcut out if
6186          * there's more than one non-empty extent in the
6187          * list. Otherwise, a check of the remaining extent is
6188          * necessary.
6189          */
6190         next_free = le16_to_cpu(el->l_next_free_rec);
6191         rec = NULL;
6192         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6193                 if (next_free > 2)
6194                         goto out;
6195
6196                 /* We may have a valid extent in index 1, check it. */
6197                 if (next_free == 2)
6198                         rec = &el->l_recs[1];
6199
6200                 /*
6201                  * Fall through - no more nonempty extents, so we want
6202                  * to delete this leaf.
6203                  */
6204         } else {
6205                 if (next_free > 1)
6206                         goto out;
6207
6208                 rec = &el->l_recs[0];
6209         }
6210
6211         if (rec) {
6212                 /*
6213                  * Check it we'll only be trimming off the end of this
6214                  * cluster.
6215                  */
6216                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6217                         goto out;
6218         }
6219
6220         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6221         if (ret) {
6222                 mlog_errno(ret);
6223                 goto out;
6224         }
6225
6226         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6227         if (ret) {
6228                 mlog_errno(ret);
6229                 goto out;
6230         }
6231
6232         eb = (struct ocfs2_extent_block *) bh->b_data;
6233         el = &eb->h_list;
6234         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6235                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6236                 ret = -EROFS;
6237                 goto out;
6238         }
6239
6240         *new_last_eb = bh;
6241         get_bh(*new_last_eb);
6242         mlog(0, "returning block %llu, (cpos: %u)\n",
6243              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6244 out:
6245         brelse(bh);
6246
6247         return ret;
6248 }
6249
6250 /*
6251  * Trim some clusters off the rightmost edge of a tree. Only called
6252  * during truncate.
6253  *
6254  * The caller needs to:
6255  *   - start journaling of each path component.
6256  *   - compute and fully set up any new last ext block
6257  */
6258 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6259                            handle_t *handle, struct ocfs2_truncate_context *tc,
6260                            u32 clusters_to_del, u64 *delete_start)
6261 {
6262         int ret, i, index = path->p_tree_depth;
6263         u32 new_edge = 0;
6264         u64 deleted_eb = 0;
6265         struct buffer_head *bh;
6266         struct ocfs2_extent_list *el;
6267         struct ocfs2_extent_rec *rec;
6268
6269         *delete_start = 0;
6270
6271         while (index >= 0) {
6272                 bh = path->p_node[index].bh;
6273                 el = path->p_node[index].el;
6274
6275                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6276                      index,  (unsigned long long)bh->b_blocknr);
6277
6278                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6279
6280                 if (index !=
6281                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6282                         ocfs2_error(inode->i_sb,
6283                                     "Inode %lu has invalid ext. block %llu",
6284                                     inode->i_ino,
6285                                     (unsigned long long)bh->b_blocknr);
6286                         ret = -EROFS;
6287                         goto out;
6288                 }
6289
6290 find_tail_record:
6291                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6292                 rec = &el->l_recs[i];
6293
6294                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6295                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6296                      ocfs2_rec_clusters(el, rec),
6297                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6298                      le16_to_cpu(el->l_next_free_rec));
6299
6300                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6301
6302                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6303                         /*
6304                          * If the leaf block contains a single empty
6305                          * extent and no records, we can just remove
6306                          * the block.
6307                          */
6308                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6309                                 memset(rec, 0,
6310                                        sizeof(struct ocfs2_extent_rec));
6311                                 el->l_next_free_rec = cpu_to_le16(0);
6312
6313                                 goto delete;
6314                         }
6315
6316                         /*
6317                          * Remove any empty extents by shifting things
6318                          * left. That should make life much easier on
6319                          * the code below. This condition is rare
6320                          * enough that we shouldn't see a performance
6321                          * hit.
6322                          */
6323                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6324                                 le16_add_cpu(&el->l_next_free_rec, -1);
6325
6326                                 for(i = 0;
6327                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6328                                         el->l_recs[i] = el->l_recs[i + 1];
6329
6330                                 memset(&el->l_recs[i], 0,
6331                                        sizeof(struct ocfs2_extent_rec));
6332
6333                                 /*
6334                                  * We've modified our extent list. The
6335                                  * simplest way to handle this change
6336                                  * is to being the search from the
6337                                  * start again.
6338                                  */
6339                                 goto find_tail_record;
6340                         }
6341
6342                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6343
6344                         /*
6345                          * We'll use "new_edge" on our way back up the
6346                          * tree to know what our rightmost cpos is.
6347                          */
6348                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6349                         new_edge += le32_to_cpu(rec->e_cpos);
6350
6351                         /*
6352                          * The caller will use this to delete data blocks.
6353                          */
6354                         *delete_start = le64_to_cpu(rec->e_blkno)
6355                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6356                                         le16_to_cpu(rec->e_leaf_clusters));
6357
6358                         /*
6359                          * If it's now empty, remove this record.
6360                          */
6361                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6362                                 memset(rec, 0,
6363                                        sizeof(struct ocfs2_extent_rec));
6364                                 le16_add_cpu(&el->l_next_free_rec, -1);
6365                         }
6366                 } else {
6367                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6368                                 memset(rec, 0,
6369                                        sizeof(struct ocfs2_extent_rec));
6370                                 le16_add_cpu(&el->l_next_free_rec, -1);
6371
6372                                 goto delete;
6373                         }
6374
6375                         /* Can this actually happen? */
6376                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6377                                 goto delete;
6378
6379                         /*
6380                          * We never actually deleted any clusters
6381                          * because our leaf was empty. There's no
6382                          * reason to adjust the rightmost edge then.
6383                          */
6384                         if (new_edge == 0)
6385                                 goto delete;
6386
6387                         rec->e_int_clusters = cpu_to_le32(new_edge);
6388                         le32_add_cpu(&rec->e_int_clusters,
6389                                      -le32_to_cpu(rec->e_cpos));
6390
6391                          /*
6392                           * A deleted child record should have been
6393                           * caught above.
6394                           */
6395                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6396                 }
6397
6398 delete:
6399                 ret = ocfs2_journal_dirty(handle, bh);
6400                 if (ret) {
6401                         mlog_errno(ret);
6402                         goto out;
6403                 }
6404
6405                 mlog(0, "extent list container %llu, after: record %d: "
6406                      "(%u, %u, %llu), next = %u.\n",
6407                      (unsigned long long)bh->b_blocknr, i,
6408                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6409                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6410                      le16_to_cpu(el->l_next_free_rec));
6411
6412                 /*
6413                  * We must be careful to only attempt delete of an
6414                  * extent block (and not the root inode block).
6415                  */
6416                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6417                         struct ocfs2_extent_block *eb =
6418                                 (struct ocfs2_extent_block *)bh->b_data;
6419
6420                         /*
6421                          * Save this for use when processing the
6422                          * parent block.
6423                          */
6424                         deleted_eb = le64_to_cpu(eb->h_blkno);
6425
6426                         mlog(0, "deleting this extent block.\n");
6427
6428                         ocfs2_remove_from_cache(inode, bh);
6429
6430                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6431                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6432                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6433
6434                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6435                         /* An error here is not fatal. */
6436                         if (ret < 0)
6437                                 mlog_errno(ret);
6438                 } else {
6439                         deleted_eb = 0;
6440                 }
6441
6442                 index--;
6443         }
6444
6445         ret = 0;
6446 out:
6447         return ret;
6448 }
6449
6450 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6451                              unsigned int clusters_to_del,
6452                              struct inode *inode,
6453                              struct buffer_head *fe_bh,
6454                              handle_t *handle,
6455                              struct ocfs2_truncate_context *tc,
6456                              struct ocfs2_path *path)
6457 {
6458         int status;
6459         struct ocfs2_dinode *fe;
6460         struct ocfs2_extent_block *last_eb = NULL;
6461         struct ocfs2_extent_list *el;
6462         struct buffer_head *last_eb_bh = NULL;
6463         u64 delete_blk = 0;
6464
6465         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6466
6467         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6468                                              path, &last_eb_bh);
6469         if (status < 0) {
6470                 mlog_errno(status);
6471                 goto bail;
6472         }
6473
6474         /*
6475          * Each component will be touched, so we might as well journal
6476          * here to avoid having to handle errors later.
6477          */
6478         status = ocfs2_journal_access_path(inode, handle, path);
6479         if (status < 0) {
6480                 mlog_errno(status);
6481                 goto bail;
6482         }
6483
6484         if (last_eb_bh) {
6485                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6486                                               OCFS2_JOURNAL_ACCESS_WRITE);
6487                 if (status < 0) {
6488                         mlog_errno(status);
6489                         goto bail;
6490                 }
6491
6492                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6493         }
6494
6495         el = &(fe->id2.i_list);
6496
6497         /*
6498          * Lower levels depend on this never happening, but it's best
6499          * to check it up here before changing the tree.
6500          */
6501         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6502                 ocfs2_error(inode->i_sb,
6503                             "Inode %lu has an empty extent record, depth %u\n",
6504                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6505                 status = -EROFS;
6506                 goto bail;
6507         }
6508
6509         spin_lock(&OCFS2_I(inode)->ip_lock);
6510         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6511                                       clusters_to_del;
6512         spin_unlock(&OCFS2_I(inode)->ip_lock);
6513         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6514         inode->i_blocks = ocfs2_inode_sector_count(inode);
6515
6516         status = ocfs2_trim_tree(inode, path, handle, tc,
6517                                  clusters_to_del, &delete_blk);
6518         if (status) {
6519                 mlog_errno(status);
6520                 goto bail;
6521         }
6522
6523         if (le32_to_cpu(fe->i_clusters) == 0) {
6524                 /* trunc to zero is a special case. */
6525                 el->l_tree_depth = 0;
6526                 fe->i_last_eb_blk = 0;
6527         } else if (last_eb)
6528                 fe->i_last_eb_blk = last_eb->h_blkno;
6529
6530         status = ocfs2_journal_dirty(handle, fe_bh);
6531         if (status < 0) {
6532                 mlog_errno(status);
6533                 goto bail;
6534         }
6535
6536         if (last_eb) {
6537                 /* If there will be a new last extent block, then by
6538                  * definition, there cannot be any leaves to the right of
6539                  * him. */
6540                 last_eb->h_next_leaf_blk = 0;
6541                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6542                 if (status < 0) {
6543                         mlog_errno(status);
6544                         goto bail;
6545                 }
6546         }
6547
6548         if (delete_blk) {
6549                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6550                                                    clusters_to_del);
6551                 if (status < 0) {
6552                         mlog_errno(status);
6553                         goto bail;
6554                 }
6555         }
6556         status = 0;
6557 bail:
6558
6559         mlog_exit(status);
6560         return status;
6561 }
6562
6563 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6564 {
6565         set_buffer_uptodate(bh);
6566         mark_buffer_dirty(bh);
6567         return 0;
6568 }
6569
6570 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6571                                      unsigned int from, unsigned int to,
6572                                      struct page *page, int zero, u64 *phys)
6573 {
6574         int ret, partial = 0;
6575
6576         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6577         if (ret)
6578                 mlog_errno(ret);
6579
6580         if (zero)
6581                 zero_user_segment(page, from, to);
6582
6583         /*
6584          * Need to set the buffers we zero'd into uptodate
6585          * here if they aren't - ocfs2_map_page_blocks()
6586          * might've skipped some
6587          */
6588         ret = walk_page_buffers(handle, page_buffers(page),
6589                                 from, to, &partial,
6590                                 ocfs2_zero_func);
6591         if (ret < 0)
6592                 mlog_errno(ret);
6593         else if (ocfs2_should_order_data(inode)) {
6594                 ret = ocfs2_jbd2_file_inode(handle, inode);
6595 #ifdef CONFIG_OCFS2_COMPAT_JBD
6596                 ret = walk_page_buffers(handle, page_buffers(page),
6597                                         from, to, &partial,
6598                                         ocfs2_journal_dirty_data);
6599 #endif
6600                 if (ret < 0)
6601                         mlog_errno(ret);
6602         }
6603
6604         if (!partial)
6605                 SetPageUptodate(page);
6606
6607         flush_dcache_page(page);
6608 }
6609
6610 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6611                                      loff_t end, struct page **pages,
6612                                      int numpages, u64 phys, handle_t *handle)
6613 {
6614         int i;
6615         struct page *page;
6616         unsigned int from, to = PAGE_CACHE_SIZE;
6617         struct super_block *sb = inode->i_sb;
6618
6619         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6620
6621         if (numpages == 0)
6622                 goto out;
6623
6624         to = PAGE_CACHE_SIZE;
6625         for(i = 0; i < numpages; i++) {
6626                 page = pages[i];
6627
6628                 from = start & (PAGE_CACHE_SIZE - 1);
6629                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6630                         to = end & (PAGE_CACHE_SIZE - 1);
6631
6632                 BUG_ON(from > PAGE_CACHE_SIZE);
6633                 BUG_ON(to > PAGE_CACHE_SIZE);
6634
6635                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6636                                          &phys);
6637
6638                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6639         }
6640 out:
6641         if (pages)
6642                 ocfs2_unlock_and_free_pages(pages, numpages);
6643 }
6644
6645 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6646                                 struct page **pages, int *num)
6647 {
6648         int numpages, ret = 0;
6649         struct super_block *sb = inode->i_sb;
6650         struct address_space *mapping = inode->i_mapping;
6651         unsigned long index;
6652         loff_t last_page_bytes;
6653
6654         BUG_ON(start > end);
6655
6656         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6657                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6658
6659         numpages = 0;
6660         last_page_bytes = PAGE_ALIGN(end);
6661         index = start >> PAGE_CACHE_SHIFT;
6662         do {
6663                 pages[numpages] = grab_cache_page(mapping, index);
6664                 if (!pages[numpages]) {
6665                         ret = -ENOMEM;
6666                         mlog_errno(ret);
6667                         goto out;
6668                 }
6669
6670                 numpages++;
6671                 index++;
6672         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6673
6674 out:
6675         if (ret != 0) {
6676                 if (pages)
6677                         ocfs2_unlock_and_free_pages(pages, numpages);
6678                 numpages = 0;
6679         }
6680
6681         *num = numpages;
6682
6683         return ret;
6684 }
6685
6686 /*
6687  * Zero the area past i_size but still within an allocated
6688  * cluster. This avoids exposing nonzero data on subsequent file
6689  * extends.
6690  *
6691  * We need to call this before i_size is updated on the inode because
6692  * otherwise block_write_full_page() will skip writeout of pages past
6693  * i_size. The new_i_size parameter is passed for this reason.
6694  */
6695 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6696                                   u64 range_start, u64 range_end)
6697 {
6698         int ret = 0, numpages;
6699         struct page **pages = NULL;
6700         u64 phys;
6701         unsigned int ext_flags;
6702         struct super_block *sb = inode->i_sb;
6703
6704         /*
6705          * File systems which don't support sparse files zero on every
6706          * extend.
6707          */
6708         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6709                 return 0;
6710
6711         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6712                         sizeof(struct page *), GFP_NOFS);
6713         if (pages == NULL) {
6714                 ret = -ENOMEM;
6715                 mlog_errno(ret);
6716                 goto out;
6717         }
6718
6719         if (range_start == range_end)
6720                 goto out;
6721
6722         ret = ocfs2_extent_map_get_blocks(inode,
6723                                           range_start >> sb->s_blocksize_bits,
6724                                           &phys, NULL, &ext_flags);
6725         if (ret) {
6726                 mlog_errno(ret);
6727                 goto out;
6728         }
6729
6730         /*
6731          * Tail is a hole, or is marked unwritten. In either case, we
6732          * can count on read and write to return/push zero's.
6733          */
6734         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6735                 goto out;
6736
6737         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6738                                    &numpages);
6739         if (ret) {
6740                 mlog_errno(ret);
6741                 goto out;
6742         }
6743
6744         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6745                                  numpages, phys, handle);
6746
6747         /*
6748          * Initiate writeout of the pages we zero'd here. We don't
6749          * wait on them - the truncate_inode_pages() call later will
6750          * do that for us.
6751          */
6752         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6753                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6754         if (ret)
6755                 mlog_errno(ret);
6756
6757 out:
6758         if (pages)
6759                 kfree(pages);
6760
6761         return ret;
6762 }
6763
6764 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6765                                              struct ocfs2_dinode *di)
6766 {
6767         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6768         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6769
6770         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6771                 memset(&di->id2, 0, blocksize -
6772                                     offsetof(struct ocfs2_dinode, id2) -
6773                                     xattrsize);
6774         else
6775                 memset(&di->id2, 0, blocksize -
6776                                     offsetof(struct ocfs2_dinode, id2));
6777 }
6778
6779 void ocfs2_dinode_new_extent_list(struct inode *inode,
6780                                   struct ocfs2_dinode *di)
6781 {
6782         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6783         di->id2.i_list.l_tree_depth = 0;
6784         di->id2.i_list.l_next_free_rec = 0;
6785         di->id2.i_list.l_count = cpu_to_le16(
6786                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6787 }
6788
6789 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6790 {
6791         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6792         struct ocfs2_inline_data *idata = &di->id2.i_data;
6793
6794         spin_lock(&oi->ip_lock);
6795         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6796         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6797         spin_unlock(&oi->ip_lock);
6798
6799         /*
6800          * We clear the entire i_data structure here so that all
6801          * fields can be properly initialized.
6802          */
6803         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6804
6805         idata->id_count = cpu_to_le16(
6806                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6807 }
6808
6809 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6810                                          struct buffer_head *di_bh)
6811 {
6812         int ret, i, has_data, num_pages = 0;
6813         handle_t *handle;
6814         u64 uninitialized_var(block);
6815         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6816         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6817         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6818         struct ocfs2_alloc_context *data_ac = NULL;
6819         struct page **pages = NULL;
6820         loff_t end = osb->s_clustersize;
6821         struct ocfs2_extent_tree et;
6822
6823         has_data = i_size_read(inode) ? 1 : 0;
6824
6825         if (has_data) {
6826                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6827                                 sizeof(struct page *), GFP_NOFS);
6828                 if (pages == NULL) {
6829                         ret = -ENOMEM;
6830                         mlog_errno(ret);
6831                         goto out;
6832                 }
6833
6834                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6835                 if (ret) {
6836                         mlog_errno(ret);
6837                         goto out;
6838                 }
6839         }
6840
6841         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6842         if (IS_ERR(handle)) {
6843                 ret = PTR_ERR(handle);
6844                 mlog_errno(ret);
6845                 goto out_unlock;
6846         }
6847
6848         ret = ocfs2_journal_access(handle, inode, di_bh,
6849                                    OCFS2_JOURNAL_ACCESS_WRITE);
6850         if (ret) {
6851                 mlog_errno(ret);
6852                 goto out_commit;
6853         }
6854
6855         if (has_data) {
6856                 u32 bit_off, num;
6857                 unsigned int page_end;
6858                 u64 phys;
6859
6860                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6861                                            &num);
6862                 if (ret) {
6863                         mlog_errno(ret);
6864                         goto out_commit;
6865                 }
6866
6867                 /*
6868                  * Save two copies, one for insert, and one that can
6869                  * be changed by ocfs2_map_and_dirty_page() below.
6870                  */
6871                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6872
6873                 /*
6874                  * Non sparse file systems zero on extend, so no need
6875                  * to do that now.
6876                  */
6877                 if (!ocfs2_sparse_alloc(osb) &&
6878                     PAGE_CACHE_SIZE < osb->s_clustersize)
6879                         end = PAGE_CACHE_SIZE;
6880
6881                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6882                 if (ret) {
6883                         mlog_errno(ret);
6884                         goto out_commit;
6885                 }
6886
6887                 /*
6888                  * This should populate the 1st page for us and mark
6889                  * it up to date.
6890                  */
6891                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6892                 if (ret) {
6893                         mlog_errno(ret);
6894                         goto out_commit;
6895                 }
6896
6897                 page_end = PAGE_CACHE_SIZE;
6898                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6899                         page_end = osb->s_clustersize;
6900
6901                 for (i = 0; i < num_pages; i++)
6902                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6903                                                  pages[i], i > 0, &phys);
6904         }
6905
6906         spin_lock(&oi->ip_lock);
6907         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6908         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6909         spin_unlock(&oi->ip_lock);
6910
6911         ocfs2_dinode_new_extent_list(inode, di);
6912
6913         ocfs2_journal_dirty(handle, di_bh);
6914
6915         if (has_data) {
6916                 /*
6917                  * An error at this point should be extremely rare. If
6918                  * this proves to be false, we could always re-build
6919                  * the in-inode data from our pages.
6920                  */
6921                 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
6922                 ret = ocfs2_insert_extent(osb, handle, inode, &et,
6923                                           0, block, 1, 0, NULL);
6924                 if (ret) {
6925                         mlog_errno(ret);
6926                         goto out_commit;
6927                 }
6928
6929                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6930         }
6931
6932 out_commit:
6933         ocfs2_commit_trans(osb, handle);
6934
6935 out_unlock:
6936         if (data_ac)
6937                 ocfs2_free_alloc_context(data_ac);
6938
6939 out:
6940         if (pages) {
6941                 ocfs2_unlock_and_free_pages(pages, num_pages);
6942                 kfree(pages);
6943         }
6944
6945         return ret;
6946 }
6947
6948 /*
6949  * It is expected, that by the time you call this function,
6950  * inode->i_size and fe->i_size have been adjusted.
6951  *
6952  * WARNING: This will kfree the truncate context
6953  */
6954 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6955                           struct inode *inode,
6956                           struct buffer_head *fe_bh,
6957                           struct ocfs2_truncate_context *tc)
6958 {
6959         int status, i, credits, tl_sem = 0;
6960         u32 clusters_to_del, new_highest_cpos, range;
6961         struct ocfs2_extent_list *el;
6962         handle_t *handle = NULL;
6963         struct inode *tl_inode = osb->osb_tl_inode;
6964         struct ocfs2_path *path = NULL;
6965         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6966
6967         mlog_entry_void();
6968
6969         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6970                                                      i_size_read(inode));
6971
6972         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6973         if (!path) {
6974                 status = -ENOMEM;
6975                 mlog_errno(status);
6976                 goto bail;
6977         }
6978
6979         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6980
6981 start:
6982         /*
6983          * Check that we still have allocation to delete.
6984          */
6985         if (OCFS2_I(inode)->ip_clusters == 0) {
6986                 status = 0;
6987                 goto bail;
6988         }
6989
6990         /*
6991          * Truncate always works against the rightmost tree branch.
6992          */
6993         status = ocfs2_find_path(inode, path, UINT_MAX);
6994         if (status) {
6995                 mlog_errno(status);
6996                 goto bail;
6997         }
6998
6999         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7000              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7001
7002         /*
7003          * By now, el will point to the extent list on the bottom most
7004          * portion of this tree. Only the tail record is considered in
7005          * each pass.
7006          *
7007          * We handle the following cases, in order:
7008          * - empty extent: delete the remaining branch
7009          * - remove the entire record
7010          * - remove a partial record
7011          * - no record needs to be removed (truncate has completed)
7012          */
7013         el = path_leaf_el(path);
7014         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7015                 ocfs2_error(inode->i_sb,
7016                             "Inode %llu has empty extent block at %llu\n",
7017                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7018                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7019                 status = -EROFS;
7020                 goto bail;
7021         }
7022
7023         i = le16_to_cpu(el->l_next_free_rec) - 1;
7024         range = le32_to_cpu(el->l_recs[i].e_cpos) +
7025                 ocfs2_rec_clusters(el, &el->l_recs[i]);
7026         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7027                 clusters_to_del = 0;
7028         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7029                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7030         } else if (range > new_highest_cpos) {
7031                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7032                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
7033                                   new_highest_cpos;
7034         } else {
7035                 status = 0;
7036                 goto bail;
7037         }
7038
7039         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7040              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7041
7042         mutex_lock(&tl_inode->i_mutex);
7043         tl_sem = 1;
7044         /* ocfs2_truncate_log_needs_flush guarantees us at least one
7045          * record is free for use. If there isn't any, we flush to get
7046          * an empty truncate log.  */
7047         if (ocfs2_truncate_log_needs_flush(osb)) {
7048                 status = __ocfs2_flush_truncate_log(osb);
7049                 if (status < 0) {
7050                         mlog_errno(status);
7051                         goto bail;
7052                 }
7053         }
7054
7055         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7056                                                 (struct ocfs2_dinode *)fe_bh->b_data,
7057                                                 el);
7058         handle = ocfs2_start_trans(osb, credits);
7059         if (IS_ERR(handle)) {
7060                 status = PTR_ERR(handle);
7061                 handle = NULL;
7062                 mlog_errno(status);
7063                 goto bail;
7064         }
7065
7066         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7067                                    tc, path);
7068         if (status < 0) {
7069                 mlog_errno(status);
7070                 goto bail;
7071         }
7072
7073         mutex_unlock(&tl_inode->i_mutex);
7074         tl_sem = 0;
7075
7076         ocfs2_commit_trans(osb, handle);
7077         handle = NULL;
7078
7079         ocfs2_reinit_path(path, 1);
7080
7081         /*
7082          * The check above will catch the case where we've truncated
7083          * away all allocation.
7084          */
7085         goto start;
7086
7087 bail:
7088
7089         ocfs2_schedule_truncate_log_flush(osb, 1);
7090
7091         if (tl_sem)
7092                 mutex_unlock(&tl_inode->i_mutex);
7093
7094         if (handle)
7095                 ocfs2_commit_trans(osb, handle);
7096
7097         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7098
7099         ocfs2_free_path(path);
7100
7101         /* This will drop the ext_alloc cluster lock for us */
7102         ocfs2_free_truncate_context(tc);
7103
7104         mlog_exit(status);
7105         return status;
7106 }
7107
7108 /*
7109  * Expects the inode to already be locked.
7110  */
7111 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7112                            struct inode *inode,
7113                            struct buffer_head *fe_bh,
7114                            struct ocfs2_truncate_context **tc)
7115 {
7116         int status;
7117         unsigned int new_i_clusters;
7118         struct ocfs2_dinode *fe;
7119         struct ocfs2_extent_block *eb;
7120         struct buffer_head *last_eb_bh = NULL;
7121
7122         mlog_entry_void();
7123
7124         *tc = NULL;
7125
7126         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7127                                                   i_size_read(inode));
7128         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7129
7130         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7131              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7132              (unsigned long long)le64_to_cpu(fe->i_size));
7133
7134         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7135         if (!(*tc)) {
7136                 status = -ENOMEM;
7137                 mlog_errno(status);
7138                 goto bail;
7139         }
7140         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7141
7142         if (fe->id2.i_list.l_tree_depth) {
7143                 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
7144                                           &last_eb_bh);
7145                 if (status < 0) {
7146                         mlog_errno(status);
7147                         goto bail;
7148                 }
7149                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7150                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
7151                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
7152
7153                         brelse(last_eb_bh);
7154                         status = -EIO;
7155                         goto bail;
7156                 }
7157         }
7158
7159         (*tc)->tc_last_eb_bh = last_eb_bh;
7160
7161         status = 0;
7162 bail:
7163         if (status < 0) {
7164                 if (*tc)
7165                         ocfs2_free_truncate_context(*tc);
7166                 *tc = NULL;
7167         }
7168         mlog_exit_void();
7169         return status;
7170 }
7171
7172 /*
7173  * 'start' is inclusive, 'end' is not.
7174  */
7175 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7176                           unsigned int start, unsigned int end, int trunc)
7177 {
7178         int ret;
7179         unsigned int numbytes;
7180         handle_t *handle;
7181         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7182         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7183         struct ocfs2_inline_data *idata = &di->id2.i_data;
7184
7185         if (end > i_size_read(inode))
7186                 end = i_size_read(inode);
7187
7188         BUG_ON(start >= end);
7189
7190         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7191             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7192             !ocfs2_supports_inline_data(osb)) {
7193                 ocfs2_error(inode->i_sb,
7194                             "Inline data flags for inode %llu don't agree! "
7195                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7196                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7197                             le16_to_cpu(di->i_dyn_features),
7198                             OCFS2_I(inode)->ip_dyn_features,
7199                             osb->s_feature_incompat);
7200                 ret = -EROFS;
7201                 goto out;
7202         }
7203
7204         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7205         if (IS_ERR(handle)) {
7206                 ret = PTR_ERR(handle);
7207                 mlog_errno(ret);
7208                 goto out;
7209         }
7210
7211         ret = ocfs2_journal_access(handle, inode, di_bh,
7212                                    OCFS2_JOURNAL_ACCESS_WRITE);
7213         if (ret) {
7214                 mlog_errno(ret);
7215                 goto out_commit;
7216         }
7217
7218         numbytes = end - start;
7219         memset(idata->id_data + start, 0, numbytes);
7220
7221         /*
7222          * No need to worry about the data page here - it's been
7223          * truncated already and inline data doesn't need it for
7224          * pushing zero's to disk, so we'll let readpage pick it up
7225          * later.
7226          */
7227         if (trunc) {
7228                 i_size_write(inode, start);
7229                 di->i_size = cpu_to_le64(start);
7230         }
7231
7232         inode->i_blocks = ocfs2_inode_sector_count(inode);
7233         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7234
7235         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7236         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7237
7238         ocfs2_journal_dirty(handle, di_bh);
7239
7240 out_commit:
7241         ocfs2_commit_trans(osb, handle);
7242
7243 out:
7244         return ret;
7245 }
7246
7247 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7248 {
7249         /*
7250          * The caller is responsible for completing deallocation
7251          * before freeing the context.
7252          */
7253         if (tc->tc_dealloc.c_first_suballocator != NULL)
7254                 mlog(ML_NOTICE,
7255                      "Truncate completion has non-empty dealloc context\n");
7256
7257         brelse(tc->tc_last_eb_bh);
7258
7259         kfree(tc);
7260 }