ocfs2: Add extent tree operation for xattr value btrees
[safe/jmp/linux-2.6] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52 /*
53  * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
54  * the b-tree operations in ocfs2. Now all the b-tree operations are not
55  * limited to ocfs2_dinode only. Any data which need to allocate clusters
56  * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
57  * and operation.
58  *
59  * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
60  * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
61  * functions.
62  * ocfs2_extent_tree_operations abstract the normal operations we do for
63  * the root of extent b-tree.
64  */
65 struct ocfs2_extent_tree;
66
67 struct ocfs2_extent_tree_operations {
68         void (*set_last_eb_blk) (struct ocfs2_extent_tree *et, u64 blkno);
69         u64 (*get_last_eb_blk) (struct ocfs2_extent_tree *et);
70         void (*update_clusters) (struct inode *inode,
71                                  struct ocfs2_extent_tree *et,
72                                  u32 new_clusters);
73         int (*sanity_check) (struct inode *inode, struct ocfs2_extent_tree *et);
74 };
75
76 struct ocfs2_extent_tree {
77         enum ocfs2_extent_tree_type type;
78         struct ocfs2_extent_tree_operations *eops;
79         struct buffer_head *root_bh;
80         struct ocfs2_extent_list *root_el;
81         void *private;
82 };
83
84 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
85                                          u64 blkno)
86 {
87         struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
88
89         BUG_ON(et->type != OCFS2_DINODE_EXTENT);
90         di->i_last_eb_blk = cpu_to_le64(blkno);
91 }
92
93 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
94 {
95         struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
96
97         BUG_ON(et->type != OCFS2_DINODE_EXTENT);
98         return le64_to_cpu(di->i_last_eb_blk);
99 }
100
101 static void ocfs2_dinode_update_clusters(struct inode *inode,
102                                          struct ocfs2_extent_tree *et,
103                                          u32 clusters)
104 {
105         struct ocfs2_dinode *di =
106                         (struct ocfs2_dinode *)et->root_bh->b_data;
107
108         le32_add_cpu(&di->i_clusters, clusters);
109         spin_lock(&OCFS2_I(inode)->ip_lock);
110         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
111         spin_unlock(&OCFS2_I(inode)->ip_lock);
112 }
113
114 static int ocfs2_dinode_sanity_check(struct inode *inode,
115                                      struct ocfs2_extent_tree *et)
116 {
117         int ret = 0;
118         struct ocfs2_dinode *di;
119
120         BUG_ON(et->type != OCFS2_DINODE_EXTENT);
121
122         di = (struct ocfs2_dinode *)et->root_bh->b_data;
123         if (!OCFS2_IS_VALID_DINODE(di)) {
124                 ret = -EIO;
125                 ocfs2_error(inode->i_sb,
126                         "Inode %llu has invalid path root",
127                         (unsigned long long)OCFS2_I(inode)->ip_blkno);
128         }
129
130         return ret;
131 }
132
133 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
134         .set_last_eb_blk        = ocfs2_dinode_set_last_eb_blk,
135         .get_last_eb_blk        = ocfs2_dinode_get_last_eb_blk,
136         .update_clusters        = ocfs2_dinode_update_clusters,
137         .sanity_check           = ocfs2_dinode_sanity_check,
138 };
139
140 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
141                                               u64 blkno)
142 {
143         struct ocfs2_xattr_value_root *xv =
144                 (struct ocfs2_xattr_value_root *)et->private;
145
146         xv->xr_last_eb_blk = cpu_to_le64(blkno);
147 }
148
149 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
150 {
151         struct ocfs2_xattr_value_root *xv =
152                 (struct ocfs2_xattr_value_root *) et->private;
153
154         return le64_to_cpu(xv->xr_last_eb_blk);
155 }
156
157 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
158                                               struct ocfs2_extent_tree *et,
159                                               u32 clusters)
160 {
161         struct ocfs2_xattr_value_root *xv =
162                 (struct ocfs2_xattr_value_root *)et->private;
163
164         le32_add_cpu(&xv->xr_clusters, clusters);
165 }
166
167 static int ocfs2_xattr_value_sanity_check(struct inode *inode,
168                                           struct ocfs2_extent_tree *et)
169 {
170         return 0;
171 }
172
173 static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
174         .set_last_eb_blk        = ocfs2_xattr_value_set_last_eb_blk,
175         .get_last_eb_blk        = ocfs2_xattr_value_get_last_eb_blk,
176         .update_clusters        = ocfs2_xattr_value_update_clusters,
177         .sanity_check           = ocfs2_xattr_value_sanity_check,
178 };
179
180 static struct ocfs2_extent_tree*
181          ocfs2_new_extent_tree(struct buffer_head *bh,
182                                enum ocfs2_extent_tree_type et_type,
183                                void *private)
184 {
185         struct ocfs2_extent_tree *et;
186
187         et = kzalloc(sizeof(*et), GFP_NOFS);
188         if (!et)
189                 return NULL;
190
191         et->type = et_type;
192         get_bh(bh);
193         et->root_bh = bh;
194         et->private = private;
195
196         if (et_type == OCFS2_DINODE_EXTENT) {
197                 et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
198                 et->eops = &ocfs2_dinode_et_ops;
199         } else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
200                 struct ocfs2_xattr_value_root *xv =
201                         (struct ocfs2_xattr_value_root *) private;
202                 et->root_el = &xv->xr_list;
203                 et->eops = &ocfs2_xattr_et_ops;
204         }
205
206         return et;
207 }
208
209 static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
210 {
211         if (et) {
212                 brelse(et->root_bh);
213                 kfree(et);
214         }
215 }
216
217 static inline void ocfs2_set_last_eb_blk(struct ocfs2_extent_tree *et,
218                                          u64 new_last_eb_blk)
219 {
220         et->eops->set_last_eb_blk(et, new_last_eb_blk);
221 }
222
223 static inline u64 ocfs2_get_last_eb_blk(struct ocfs2_extent_tree *et)
224 {
225         return et->eops->get_last_eb_blk(et);
226 }
227
228 static inline void ocfs2_update_clusters(struct inode *inode,
229                                          struct ocfs2_extent_tree *et,
230                                          u32 clusters)
231 {
232         et->eops->update_clusters(inode, et, clusters);
233 }
234
235 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
236 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
237                                          struct ocfs2_extent_block *eb);
238
239 /*
240  * Structures which describe a path through a btree, and functions to
241  * manipulate them.
242  *
243  * The idea here is to be as generic as possible with the tree
244  * manipulation code.
245  */
246 struct ocfs2_path_item {
247         struct buffer_head              *bh;
248         struct ocfs2_extent_list        *el;
249 };
250
251 #define OCFS2_MAX_PATH_DEPTH    5
252
253 struct ocfs2_path {
254         int                     p_tree_depth;
255         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
256 };
257
258 #define path_root_bh(_path) ((_path)->p_node[0].bh)
259 #define path_root_el(_path) ((_path)->p_node[0].el)
260 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
261 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
262 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
263
264 /*
265  * Reset the actual path elements so that we can re-use the structure
266  * to build another path. Generally, this involves freeing the buffer
267  * heads.
268  */
269 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
270 {
271         int i, start = 0, depth = 0;
272         struct ocfs2_path_item *node;
273
274         if (keep_root)
275                 start = 1;
276
277         for(i = start; i < path_num_items(path); i++) {
278                 node = &path->p_node[i];
279
280                 brelse(node->bh);
281                 node->bh = NULL;
282                 node->el = NULL;
283         }
284
285         /*
286          * Tree depth may change during truncate, or insert. If we're
287          * keeping the root extent list, then make sure that our path
288          * structure reflects the proper depth.
289          */
290         if (keep_root)
291                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
292
293         path->p_tree_depth = depth;
294 }
295
296 static void ocfs2_free_path(struct ocfs2_path *path)
297 {
298         if (path) {
299                 ocfs2_reinit_path(path, 0);
300                 kfree(path);
301         }
302 }
303
304 /*
305  * All the elements of src into dest. After this call, src could be freed
306  * without affecting dest.
307  *
308  * Both paths should have the same root. Any non-root elements of dest
309  * will be freed.
310  */
311 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
312 {
313         int i;
314
315         BUG_ON(path_root_bh(dest) != path_root_bh(src));
316         BUG_ON(path_root_el(dest) != path_root_el(src));
317
318         ocfs2_reinit_path(dest, 1);
319
320         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
321                 dest->p_node[i].bh = src->p_node[i].bh;
322                 dest->p_node[i].el = src->p_node[i].el;
323
324                 if (dest->p_node[i].bh)
325                         get_bh(dest->p_node[i].bh);
326         }
327 }
328
329 /*
330  * Make the *dest path the same as src and re-initialize src path to
331  * have a root only.
332  */
333 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
334 {
335         int i;
336
337         BUG_ON(path_root_bh(dest) != path_root_bh(src));
338
339         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
340                 brelse(dest->p_node[i].bh);
341
342                 dest->p_node[i].bh = src->p_node[i].bh;
343                 dest->p_node[i].el = src->p_node[i].el;
344
345                 src->p_node[i].bh = NULL;
346                 src->p_node[i].el = NULL;
347         }
348 }
349
350 /*
351  * Insert an extent block at given index.
352  *
353  * This will not take an additional reference on eb_bh.
354  */
355 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
356                                         struct buffer_head *eb_bh)
357 {
358         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
359
360         /*
361          * Right now, no root bh is an extent block, so this helps
362          * catch code errors with dinode trees. The assertion can be
363          * safely removed if we ever need to insert extent block
364          * structures at the root.
365          */
366         BUG_ON(index == 0);
367
368         path->p_node[index].bh = eb_bh;
369         path->p_node[index].el = &eb->h_list;
370 }
371
372 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
373                                          struct ocfs2_extent_list *root_el)
374 {
375         struct ocfs2_path *path;
376
377         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
378
379         path = kzalloc(sizeof(*path), GFP_NOFS);
380         if (path) {
381                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
382                 get_bh(root_bh);
383                 path_root_bh(path) = root_bh;
384                 path_root_el(path) = root_el;
385         }
386
387         return path;
388 }
389
390 /*
391  * Convenience function to journal all components in a path.
392  */
393 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
394                                      struct ocfs2_path *path)
395 {
396         int i, ret = 0;
397
398         if (!path)
399                 goto out;
400
401         for(i = 0; i < path_num_items(path); i++) {
402                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
403                                            OCFS2_JOURNAL_ACCESS_WRITE);
404                 if (ret < 0) {
405                         mlog_errno(ret);
406                         goto out;
407                 }
408         }
409
410 out:
411         return ret;
412 }
413
414 /*
415  * Return the index of the extent record which contains cluster #v_cluster.
416  * -1 is returned if it was not found.
417  *
418  * Should work fine on interior and exterior nodes.
419  */
420 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
421 {
422         int ret = -1;
423         int i;
424         struct ocfs2_extent_rec *rec;
425         u32 rec_end, rec_start, clusters;
426
427         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
428                 rec = &el->l_recs[i];
429
430                 rec_start = le32_to_cpu(rec->e_cpos);
431                 clusters = ocfs2_rec_clusters(el, rec);
432
433                 rec_end = rec_start + clusters;
434
435                 if (v_cluster >= rec_start && v_cluster < rec_end) {
436                         ret = i;
437                         break;
438                 }
439         }
440
441         return ret;
442 }
443
444 enum ocfs2_contig_type {
445         CONTIG_NONE = 0,
446         CONTIG_LEFT,
447         CONTIG_RIGHT,
448         CONTIG_LEFTRIGHT,
449 };
450
451
452 /*
453  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
454  * ocfs2_extent_contig only work properly against leaf nodes!
455  */
456 static int ocfs2_block_extent_contig(struct super_block *sb,
457                                      struct ocfs2_extent_rec *ext,
458                                      u64 blkno)
459 {
460         u64 blk_end = le64_to_cpu(ext->e_blkno);
461
462         blk_end += ocfs2_clusters_to_blocks(sb,
463                                     le16_to_cpu(ext->e_leaf_clusters));
464
465         return blkno == blk_end;
466 }
467
468 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
469                                   struct ocfs2_extent_rec *right)
470 {
471         u32 left_range;
472
473         left_range = le32_to_cpu(left->e_cpos) +
474                 le16_to_cpu(left->e_leaf_clusters);
475
476         return (left_range == le32_to_cpu(right->e_cpos));
477 }
478
479 static enum ocfs2_contig_type
480         ocfs2_extent_contig(struct inode *inode,
481                             struct ocfs2_extent_rec *ext,
482                             struct ocfs2_extent_rec *insert_rec)
483 {
484         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
485
486         /*
487          * Refuse to coalesce extent records with different flag
488          * fields - we don't want to mix unwritten extents with user
489          * data.
490          */
491         if (ext->e_flags != insert_rec->e_flags)
492                 return CONTIG_NONE;
493
494         if (ocfs2_extents_adjacent(ext, insert_rec) &&
495             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
496                         return CONTIG_RIGHT;
497
498         blkno = le64_to_cpu(ext->e_blkno);
499         if (ocfs2_extents_adjacent(insert_rec, ext) &&
500             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
501                 return CONTIG_LEFT;
502
503         return CONTIG_NONE;
504 }
505
506 /*
507  * NOTE: We can have pretty much any combination of contiguousness and
508  * appending.
509  *
510  * The usefulness of APPEND_TAIL is more in that it lets us know that
511  * we'll have to update the path to that leaf.
512  */
513 enum ocfs2_append_type {
514         APPEND_NONE = 0,
515         APPEND_TAIL,
516 };
517
518 enum ocfs2_split_type {
519         SPLIT_NONE = 0,
520         SPLIT_LEFT,
521         SPLIT_RIGHT,
522 };
523
524 struct ocfs2_insert_type {
525         enum ocfs2_split_type   ins_split;
526         enum ocfs2_append_type  ins_appending;
527         enum ocfs2_contig_type  ins_contig;
528         int                     ins_contig_index;
529         int                     ins_tree_depth;
530 };
531
532 struct ocfs2_merge_ctxt {
533         enum ocfs2_contig_type  c_contig_type;
534         int                     c_has_empty_extent;
535         int                     c_split_covers_rec;
536 };
537
538 /*
539  * How many free extents have we got before we need more meta data?
540  */
541 int ocfs2_num_free_extents(struct ocfs2_super *osb,
542                            struct inode *inode,
543                            struct buffer_head *root_bh,
544                            enum ocfs2_extent_tree_type type,
545                            void *private)
546 {
547         int retval;
548         struct ocfs2_extent_list *el = NULL;
549         struct ocfs2_extent_block *eb;
550         struct buffer_head *eb_bh = NULL;
551         u64 last_eb_blk = 0;
552
553         mlog_entry_void();
554
555         if (type == OCFS2_DINODE_EXTENT) {
556                 struct ocfs2_dinode *fe =
557                                 (struct ocfs2_dinode *)root_bh->b_data;
558                 if (!OCFS2_IS_VALID_DINODE(fe)) {
559                         OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
560                         retval = -EIO;
561                         goto bail;
562                 }
563
564                 if (fe->i_last_eb_blk)
565                         last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
566                 el = &fe->id2.i_list;
567         } else if (type == OCFS2_XATTR_VALUE_EXTENT) {
568                 struct ocfs2_xattr_value_root *xv =
569                         (struct ocfs2_xattr_value_root *) private;
570
571                 last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
572                 el = &xv->xr_list;
573         }
574
575         if (last_eb_blk) {
576                 retval = ocfs2_read_block(osb, last_eb_blk,
577                                           &eb_bh, OCFS2_BH_CACHED, inode);
578                 if (retval < 0) {
579                         mlog_errno(retval);
580                         goto bail;
581                 }
582                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
583                 el = &eb->h_list;
584         }
585
586         BUG_ON(el->l_tree_depth != 0);
587
588         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
589 bail:
590         if (eb_bh)
591                 brelse(eb_bh);
592
593         mlog_exit(retval);
594         return retval;
595 }
596
597 /* expects array to already be allocated
598  *
599  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
600  * l_count for you
601  */
602 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
603                                      handle_t *handle,
604                                      struct inode *inode,
605                                      int wanted,
606                                      struct ocfs2_alloc_context *meta_ac,
607                                      struct buffer_head *bhs[])
608 {
609         int count, status, i;
610         u16 suballoc_bit_start;
611         u32 num_got;
612         u64 first_blkno;
613         struct ocfs2_extent_block *eb;
614
615         mlog_entry_void();
616
617         count = 0;
618         while (count < wanted) {
619                 status = ocfs2_claim_metadata(osb,
620                                               handle,
621                                               meta_ac,
622                                               wanted - count,
623                                               &suballoc_bit_start,
624                                               &num_got,
625                                               &first_blkno);
626                 if (status < 0) {
627                         mlog_errno(status);
628                         goto bail;
629                 }
630
631                 for(i = count;  i < (num_got + count); i++) {
632                         bhs[i] = sb_getblk(osb->sb, first_blkno);
633                         if (bhs[i] == NULL) {
634                                 status = -EIO;
635                                 mlog_errno(status);
636                                 goto bail;
637                         }
638                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
639
640                         status = ocfs2_journal_access(handle, inode, bhs[i],
641                                                       OCFS2_JOURNAL_ACCESS_CREATE);
642                         if (status < 0) {
643                                 mlog_errno(status);
644                                 goto bail;
645                         }
646
647                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
648                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
649                         /* Ok, setup the minimal stuff here. */
650                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
651                         eb->h_blkno = cpu_to_le64(first_blkno);
652                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
653                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
654                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
655                         eb->h_list.l_count =
656                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
657
658                         suballoc_bit_start++;
659                         first_blkno++;
660
661                         /* We'll also be dirtied by the caller, so
662                          * this isn't absolutely necessary. */
663                         status = ocfs2_journal_dirty(handle, bhs[i]);
664                         if (status < 0) {
665                                 mlog_errno(status);
666                                 goto bail;
667                         }
668                 }
669
670                 count += num_got;
671         }
672
673         status = 0;
674 bail:
675         if (status < 0) {
676                 for(i = 0; i < wanted; i++) {
677                         if (bhs[i])
678                                 brelse(bhs[i]);
679                         bhs[i] = NULL;
680                 }
681         }
682         mlog_exit(status);
683         return status;
684 }
685
686 /*
687  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
688  *
689  * Returns the sum of the rightmost extent rec logical offset and
690  * cluster count.
691  *
692  * ocfs2_add_branch() uses this to determine what logical cluster
693  * value should be populated into the leftmost new branch records.
694  *
695  * ocfs2_shift_tree_depth() uses this to determine the # clusters
696  * value for the new topmost tree record.
697  */
698 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
699 {
700         int i;
701
702         i = le16_to_cpu(el->l_next_free_rec) - 1;
703
704         return le32_to_cpu(el->l_recs[i].e_cpos) +
705                 ocfs2_rec_clusters(el, &el->l_recs[i]);
706 }
707
708 /*
709  * Add an entire tree branch to our inode. eb_bh is the extent block
710  * to start at, if we don't want to start the branch at the dinode
711  * structure.
712  *
713  * last_eb_bh is required as we have to update it's next_leaf pointer
714  * for the new last extent block.
715  *
716  * the new branch will be 'empty' in the sense that every block will
717  * contain a single record with cluster count == 0.
718  */
719 static int ocfs2_add_branch(struct ocfs2_super *osb,
720                             handle_t *handle,
721                             struct inode *inode,
722                             struct ocfs2_extent_tree *et,
723                             struct buffer_head *eb_bh,
724                             struct buffer_head **last_eb_bh,
725                             struct ocfs2_alloc_context *meta_ac)
726 {
727         int status, new_blocks, i;
728         u64 next_blkno, new_last_eb_blk;
729         struct buffer_head *bh;
730         struct buffer_head **new_eb_bhs = NULL;
731         struct ocfs2_extent_block *eb;
732         struct ocfs2_extent_list  *eb_el;
733         struct ocfs2_extent_list  *el;
734         u32 new_cpos;
735
736         mlog_entry_void();
737
738         BUG_ON(!last_eb_bh || !*last_eb_bh);
739
740         if (eb_bh) {
741                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
742                 el = &eb->h_list;
743         } else
744                 el = et->root_el;
745
746         /* we never add a branch to a leaf. */
747         BUG_ON(!el->l_tree_depth);
748
749         new_blocks = le16_to_cpu(el->l_tree_depth);
750
751         /* allocate the number of new eb blocks we need */
752         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
753                              GFP_KERNEL);
754         if (!new_eb_bhs) {
755                 status = -ENOMEM;
756                 mlog_errno(status);
757                 goto bail;
758         }
759
760         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
761                                            meta_ac, new_eb_bhs);
762         if (status < 0) {
763                 mlog_errno(status);
764                 goto bail;
765         }
766
767         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
768         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
769
770         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
771          * linked with the rest of the tree.
772          * conversly, new_eb_bhs[0] is the new bottommost leaf.
773          *
774          * when we leave the loop, new_last_eb_blk will point to the
775          * newest leaf, and next_blkno will point to the topmost extent
776          * block. */
777         next_blkno = new_last_eb_blk = 0;
778         for(i = 0; i < new_blocks; i++) {
779                 bh = new_eb_bhs[i];
780                 eb = (struct ocfs2_extent_block *) bh->b_data;
781                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
782                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
783                         status = -EIO;
784                         goto bail;
785                 }
786                 eb_el = &eb->h_list;
787
788                 status = ocfs2_journal_access(handle, inode, bh,
789                                               OCFS2_JOURNAL_ACCESS_CREATE);
790                 if (status < 0) {
791                         mlog_errno(status);
792                         goto bail;
793                 }
794
795                 eb->h_next_leaf_blk = 0;
796                 eb_el->l_tree_depth = cpu_to_le16(i);
797                 eb_el->l_next_free_rec = cpu_to_le16(1);
798                 /*
799                  * This actually counts as an empty extent as
800                  * c_clusters == 0
801                  */
802                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
803                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
804                 /*
805                  * eb_el isn't always an interior node, but even leaf
806                  * nodes want a zero'd flags and reserved field so
807                  * this gets the whole 32 bits regardless of use.
808                  */
809                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
810                 if (!eb_el->l_tree_depth)
811                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
812
813                 status = ocfs2_journal_dirty(handle, bh);
814                 if (status < 0) {
815                         mlog_errno(status);
816                         goto bail;
817                 }
818
819                 next_blkno = le64_to_cpu(eb->h_blkno);
820         }
821
822         /* This is a bit hairy. We want to update up to three blocks
823          * here without leaving any of them in an inconsistent state
824          * in case of error. We don't have to worry about
825          * journal_dirty erroring as it won't unless we've aborted the
826          * handle (in which case we would never be here) so reserving
827          * the write with journal_access is all we need to do. */
828         status = ocfs2_journal_access(handle, inode, *last_eb_bh,
829                                       OCFS2_JOURNAL_ACCESS_WRITE);
830         if (status < 0) {
831                 mlog_errno(status);
832                 goto bail;
833         }
834         status = ocfs2_journal_access(handle, inode, et->root_bh,
835                                       OCFS2_JOURNAL_ACCESS_WRITE);
836         if (status < 0) {
837                 mlog_errno(status);
838                 goto bail;
839         }
840         if (eb_bh) {
841                 status = ocfs2_journal_access(handle, inode, eb_bh,
842                                               OCFS2_JOURNAL_ACCESS_WRITE);
843                 if (status < 0) {
844                         mlog_errno(status);
845                         goto bail;
846                 }
847         }
848
849         /* Link the new branch into the rest of the tree (el will
850          * either be on the root_bh, or the extent block passed in. */
851         i = le16_to_cpu(el->l_next_free_rec);
852         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
853         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
854         el->l_recs[i].e_int_clusters = 0;
855         le16_add_cpu(&el->l_next_free_rec, 1);
856
857         /* fe needs a new last extent block pointer, as does the
858          * next_leaf on the previously last-extent-block. */
859         ocfs2_set_last_eb_blk(et, new_last_eb_blk);
860
861         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
862         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
863
864         status = ocfs2_journal_dirty(handle, *last_eb_bh);
865         if (status < 0)
866                 mlog_errno(status);
867         status = ocfs2_journal_dirty(handle, et->root_bh);
868         if (status < 0)
869                 mlog_errno(status);
870         if (eb_bh) {
871                 status = ocfs2_journal_dirty(handle, eb_bh);
872                 if (status < 0)
873                         mlog_errno(status);
874         }
875
876         /*
877          * Some callers want to track the rightmost leaf so pass it
878          * back here.
879          */
880         brelse(*last_eb_bh);
881         get_bh(new_eb_bhs[0]);
882         *last_eb_bh = new_eb_bhs[0];
883
884         status = 0;
885 bail:
886         if (new_eb_bhs) {
887                 for (i = 0; i < new_blocks; i++)
888                         if (new_eb_bhs[i])
889                                 brelse(new_eb_bhs[i]);
890                 kfree(new_eb_bhs);
891         }
892
893         mlog_exit(status);
894         return status;
895 }
896
897 /*
898  * adds another level to the allocation tree.
899  * returns back the new extent block so you can add a branch to it
900  * after this call.
901  */
902 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
903                                   handle_t *handle,
904                                   struct inode *inode,
905                                   struct ocfs2_extent_tree *et,
906                                   struct ocfs2_alloc_context *meta_ac,
907                                   struct buffer_head **ret_new_eb_bh)
908 {
909         int status, i;
910         u32 new_clusters;
911         struct buffer_head *new_eb_bh = NULL;
912         struct ocfs2_extent_block *eb;
913         struct ocfs2_extent_list  *root_el;
914         struct ocfs2_extent_list  *eb_el;
915
916         mlog_entry_void();
917
918         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
919                                            &new_eb_bh);
920         if (status < 0) {
921                 mlog_errno(status);
922                 goto bail;
923         }
924
925         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
926         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
927                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
928                 status = -EIO;
929                 goto bail;
930         }
931
932         eb_el = &eb->h_list;
933         root_el = et->root_el;
934
935         status = ocfs2_journal_access(handle, inode, new_eb_bh,
936                                       OCFS2_JOURNAL_ACCESS_CREATE);
937         if (status < 0) {
938                 mlog_errno(status);
939                 goto bail;
940         }
941
942         /* copy the root extent list data into the new extent block */
943         eb_el->l_tree_depth = root_el->l_tree_depth;
944         eb_el->l_next_free_rec = root_el->l_next_free_rec;
945         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
946                 eb_el->l_recs[i] = root_el->l_recs[i];
947
948         status = ocfs2_journal_dirty(handle, new_eb_bh);
949         if (status < 0) {
950                 mlog_errno(status);
951                 goto bail;
952         }
953
954         status = ocfs2_journal_access(handle, inode, et->root_bh,
955                                       OCFS2_JOURNAL_ACCESS_WRITE);
956         if (status < 0) {
957                 mlog_errno(status);
958                 goto bail;
959         }
960
961         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
962
963         /* update root_bh now */
964         le16_add_cpu(&root_el->l_tree_depth, 1);
965         root_el->l_recs[0].e_cpos = 0;
966         root_el->l_recs[0].e_blkno = eb->h_blkno;
967         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
968         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
969                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
970         root_el->l_next_free_rec = cpu_to_le16(1);
971
972         /* If this is our 1st tree depth shift, then last_eb_blk
973          * becomes the allocated extent block */
974         if (root_el->l_tree_depth == cpu_to_le16(1))
975                 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
976
977         status = ocfs2_journal_dirty(handle, et->root_bh);
978         if (status < 0) {
979                 mlog_errno(status);
980                 goto bail;
981         }
982
983         *ret_new_eb_bh = new_eb_bh;
984         new_eb_bh = NULL;
985         status = 0;
986 bail:
987         if (new_eb_bh)
988                 brelse(new_eb_bh);
989
990         mlog_exit(status);
991         return status;
992 }
993
994 /*
995  * Should only be called when there is no space left in any of the
996  * leaf nodes. What we want to do is find the lowest tree depth
997  * non-leaf extent block with room for new records. There are three
998  * valid results of this search:
999  *
1000  * 1) a lowest extent block is found, then we pass it back in
1001  *    *lowest_eb_bh and return '0'
1002  *
1003  * 2) the search fails to find anything, but the root_el has room. We
1004  *    pass NULL back in *lowest_eb_bh, but still return '0'
1005  *
1006  * 3) the search fails to find anything AND the root_el is full, in
1007  *    which case we return > 0
1008  *
1009  * return status < 0 indicates an error.
1010  */
1011 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1012                                     struct inode *inode,
1013                                     struct ocfs2_extent_tree *et,
1014                                     struct buffer_head **target_bh)
1015 {
1016         int status = 0, i;
1017         u64 blkno;
1018         struct ocfs2_extent_block *eb;
1019         struct ocfs2_extent_list  *el;
1020         struct buffer_head *bh = NULL;
1021         struct buffer_head *lowest_bh = NULL;
1022
1023         mlog_entry_void();
1024
1025         *target_bh = NULL;
1026
1027         el = et->root_el;
1028
1029         while(le16_to_cpu(el->l_tree_depth) > 1) {
1030                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1031                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1032                                     "extent list (next_free_rec == 0)",
1033                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1034                         status = -EIO;
1035                         goto bail;
1036                 }
1037                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1038                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1039                 if (!blkno) {
1040                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1041                                     "list where extent # %d has no physical "
1042                                     "block start",
1043                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1044                         status = -EIO;
1045                         goto bail;
1046                 }
1047
1048                 if (bh) {
1049                         brelse(bh);
1050                         bh = NULL;
1051                 }
1052
1053                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
1054                                           inode);
1055                 if (status < 0) {
1056                         mlog_errno(status);
1057                         goto bail;
1058                 }
1059
1060                 eb = (struct ocfs2_extent_block *) bh->b_data;
1061                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1062                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1063                         status = -EIO;
1064                         goto bail;
1065                 }
1066                 el = &eb->h_list;
1067
1068                 if (le16_to_cpu(el->l_next_free_rec) <
1069                     le16_to_cpu(el->l_count)) {
1070                         if (lowest_bh)
1071                                 brelse(lowest_bh);
1072                         lowest_bh = bh;
1073                         get_bh(lowest_bh);
1074                 }
1075         }
1076
1077         /* If we didn't find one and the fe doesn't have any room,
1078          * then return '1' */
1079         el = et->root_el;
1080         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1081                 status = 1;
1082
1083         *target_bh = lowest_bh;
1084 bail:
1085         if (bh)
1086                 brelse(bh);
1087
1088         mlog_exit(status);
1089         return status;
1090 }
1091
1092 /*
1093  * Grow a b-tree so that it has more records.
1094  *
1095  * We might shift the tree depth in which case existing paths should
1096  * be considered invalid.
1097  *
1098  * Tree depth after the grow is returned via *final_depth.
1099  *
1100  * *last_eb_bh will be updated by ocfs2_add_branch().
1101  */
1102 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1103                            struct ocfs2_extent_tree *et, int *final_depth,
1104                            struct buffer_head **last_eb_bh,
1105                            struct ocfs2_alloc_context *meta_ac)
1106 {
1107         int ret, shift;
1108         struct ocfs2_extent_list *el = et->root_el;
1109         int depth = le16_to_cpu(el->l_tree_depth);
1110         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1111         struct buffer_head *bh = NULL;
1112
1113         BUG_ON(meta_ac == NULL);
1114
1115         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1116         if (shift < 0) {
1117                 ret = shift;
1118                 mlog_errno(ret);
1119                 goto out;
1120         }
1121
1122         /* We traveled all the way to the bottom of the allocation tree
1123          * and didn't find room for any more extents - we need to add
1124          * another tree level */
1125         if (shift) {
1126                 BUG_ON(bh);
1127                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1128
1129                 /* ocfs2_shift_tree_depth will return us a buffer with
1130                  * the new extent block (so we can pass that to
1131                  * ocfs2_add_branch). */
1132                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1133                                              meta_ac, &bh);
1134                 if (ret < 0) {
1135                         mlog_errno(ret);
1136                         goto out;
1137                 }
1138                 depth++;
1139                 if (depth == 1) {
1140                         /*
1141                          * Special case: we have room now if we shifted from
1142                          * tree_depth 0, so no more work needs to be done.
1143                          *
1144                          * We won't be calling add_branch, so pass
1145                          * back *last_eb_bh as the new leaf. At depth
1146                          * zero, it should always be null so there's
1147                          * no reason to brelse.
1148                          */
1149                         BUG_ON(*last_eb_bh);
1150                         get_bh(bh);
1151                         *last_eb_bh = bh;
1152                         goto out;
1153                 }
1154         }
1155
1156         /* call ocfs2_add_branch to add the final part of the tree with
1157          * the new data. */
1158         mlog(0, "add branch. bh = %p\n", bh);
1159         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1160                                meta_ac);
1161         if (ret < 0) {
1162                 mlog_errno(ret);
1163                 goto out;
1164         }
1165
1166 out:
1167         if (final_depth)
1168                 *final_depth = depth;
1169         brelse(bh);
1170         return ret;
1171 }
1172
1173 /*
1174  * This function will discard the rightmost extent record.
1175  */
1176 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1177 {
1178         int next_free = le16_to_cpu(el->l_next_free_rec);
1179         int count = le16_to_cpu(el->l_count);
1180         unsigned int num_bytes;
1181
1182         BUG_ON(!next_free);
1183         /* This will cause us to go off the end of our extent list. */
1184         BUG_ON(next_free >= count);
1185
1186         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1187
1188         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1189 }
1190
1191 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1192                               struct ocfs2_extent_rec *insert_rec)
1193 {
1194         int i, insert_index, next_free, has_empty, num_bytes;
1195         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1196         struct ocfs2_extent_rec *rec;
1197
1198         next_free = le16_to_cpu(el->l_next_free_rec);
1199         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1200
1201         BUG_ON(!next_free);
1202
1203         /* The tree code before us didn't allow enough room in the leaf. */
1204         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1205
1206         /*
1207          * The easiest way to approach this is to just remove the
1208          * empty extent and temporarily decrement next_free.
1209          */
1210         if (has_empty) {
1211                 /*
1212                  * If next_free was 1 (only an empty extent), this
1213                  * loop won't execute, which is fine. We still want
1214                  * the decrement above to happen.
1215                  */
1216                 for(i = 0; i < (next_free - 1); i++)
1217                         el->l_recs[i] = el->l_recs[i+1];
1218
1219                 next_free--;
1220         }
1221
1222         /*
1223          * Figure out what the new record index should be.
1224          */
1225         for(i = 0; i < next_free; i++) {
1226                 rec = &el->l_recs[i];
1227
1228                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1229                         break;
1230         }
1231         insert_index = i;
1232
1233         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1234              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1235
1236         BUG_ON(insert_index < 0);
1237         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1238         BUG_ON(insert_index > next_free);
1239
1240         /*
1241          * No need to memmove if we're just adding to the tail.
1242          */
1243         if (insert_index != next_free) {
1244                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1245
1246                 num_bytes = next_free - insert_index;
1247                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1248                 memmove(&el->l_recs[insert_index + 1],
1249                         &el->l_recs[insert_index],
1250                         num_bytes);
1251         }
1252
1253         /*
1254          * Either we had an empty extent, and need to re-increment or
1255          * there was no empty extent on a non full rightmost leaf node,
1256          * in which case we still need to increment.
1257          */
1258         next_free++;
1259         el->l_next_free_rec = cpu_to_le16(next_free);
1260         /*
1261          * Make sure none of the math above just messed up our tree.
1262          */
1263         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1264
1265         el->l_recs[insert_index] = *insert_rec;
1266
1267 }
1268
1269 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1270 {
1271         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1272
1273         BUG_ON(num_recs == 0);
1274
1275         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1276                 num_recs--;
1277                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1278                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1279                 memset(&el->l_recs[num_recs], 0,
1280                        sizeof(struct ocfs2_extent_rec));
1281                 el->l_next_free_rec = cpu_to_le16(num_recs);
1282         }
1283 }
1284
1285 /*
1286  * Create an empty extent record .
1287  *
1288  * l_next_free_rec may be updated.
1289  *
1290  * If an empty extent already exists do nothing.
1291  */
1292 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1293 {
1294         int next_free = le16_to_cpu(el->l_next_free_rec);
1295
1296         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1297
1298         if (next_free == 0)
1299                 goto set_and_inc;
1300
1301         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1302                 return;
1303
1304         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1305                         "Asked to create an empty extent in a full list:\n"
1306                         "count = %u, tree depth = %u",
1307                         le16_to_cpu(el->l_count),
1308                         le16_to_cpu(el->l_tree_depth));
1309
1310         ocfs2_shift_records_right(el);
1311
1312 set_and_inc:
1313         le16_add_cpu(&el->l_next_free_rec, 1);
1314         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1315 }
1316
1317 /*
1318  * For a rotation which involves two leaf nodes, the "root node" is
1319  * the lowest level tree node which contains a path to both leafs. This
1320  * resulting set of information can be used to form a complete "subtree"
1321  *
1322  * This function is passed two full paths from the dinode down to a
1323  * pair of adjacent leaves. It's task is to figure out which path
1324  * index contains the subtree root - this can be the root index itself
1325  * in a worst-case rotation.
1326  *
1327  * The array index of the subtree root is passed back.
1328  */
1329 static int ocfs2_find_subtree_root(struct inode *inode,
1330                                    struct ocfs2_path *left,
1331                                    struct ocfs2_path *right)
1332 {
1333         int i = 0;
1334
1335         /*
1336          * Check that the caller passed in two paths from the same tree.
1337          */
1338         BUG_ON(path_root_bh(left) != path_root_bh(right));
1339
1340         do {
1341                 i++;
1342
1343                 /*
1344                  * The caller didn't pass two adjacent paths.
1345                  */
1346                 mlog_bug_on_msg(i > left->p_tree_depth,
1347                                 "Inode %lu, left depth %u, right depth %u\n"
1348                                 "left leaf blk %llu, right leaf blk %llu\n",
1349                                 inode->i_ino, left->p_tree_depth,
1350                                 right->p_tree_depth,
1351                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1352                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1353         } while (left->p_node[i].bh->b_blocknr ==
1354                  right->p_node[i].bh->b_blocknr);
1355
1356         return i - 1;
1357 }
1358
1359 typedef void (path_insert_t)(void *, struct buffer_head *);
1360
1361 /*
1362  * Traverse a btree path in search of cpos, starting at root_el.
1363  *
1364  * This code can be called with a cpos larger than the tree, in which
1365  * case it will return the rightmost path.
1366  */
1367 static int __ocfs2_find_path(struct inode *inode,
1368                              struct ocfs2_extent_list *root_el, u32 cpos,
1369                              path_insert_t *func, void *data)
1370 {
1371         int i, ret = 0;
1372         u32 range;
1373         u64 blkno;
1374         struct buffer_head *bh = NULL;
1375         struct ocfs2_extent_block *eb;
1376         struct ocfs2_extent_list *el;
1377         struct ocfs2_extent_rec *rec;
1378         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1379
1380         el = root_el;
1381         while (el->l_tree_depth) {
1382                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1383                         ocfs2_error(inode->i_sb,
1384                                     "Inode %llu has empty extent list at "
1385                                     "depth %u\n",
1386                                     (unsigned long long)oi->ip_blkno,
1387                                     le16_to_cpu(el->l_tree_depth));
1388                         ret = -EROFS;
1389                         goto out;
1390
1391                 }
1392
1393                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1394                         rec = &el->l_recs[i];
1395
1396                         /*
1397                          * In the case that cpos is off the allocation
1398                          * tree, this should just wind up returning the
1399                          * rightmost record.
1400                          */
1401                         range = le32_to_cpu(rec->e_cpos) +
1402                                 ocfs2_rec_clusters(el, rec);
1403                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1404                             break;
1405                 }
1406
1407                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1408                 if (blkno == 0) {
1409                         ocfs2_error(inode->i_sb,
1410                                     "Inode %llu has bad blkno in extent list "
1411                                     "at depth %u (index %d)\n",
1412                                     (unsigned long long)oi->ip_blkno,
1413                                     le16_to_cpu(el->l_tree_depth), i);
1414                         ret = -EROFS;
1415                         goto out;
1416                 }
1417
1418                 brelse(bh);
1419                 bh = NULL;
1420                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1421                                        &bh, OCFS2_BH_CACHED, inode);
1422                 if (ret) {
1423                         mlog_errno(ret);
1424                         goto out;
1425                 }
1426
1427                 eb = (struct ocfs2_extent_block *) bh->b_data;
1428                 el = &eb->h_list;
1429                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1430                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1431                         ret = -EIO;
1432                         goto out;
1433                 }
1434
1435                 if (le16_to_cpu(el->l_next_free_rec) >
1436                     le16_to_cpu(el->l_count)) {
1437                         ocfs2_error(inode->i_sb,
1438                                     "Inode %llu has bad count in extent list "
1439                                     "at block %llu (next free=%u, count=%u)\n",
1440                                     (unsigned long long)oi->ip_blkno,
1441                                     (unsigned long long)bh->b_blocknr,
1442                                     le16_to_cpu(el->l_next_free_rec),
1443                                     le16_to_cpu(el->l_count));
1444                         ret = -EROFS;
1445                         goto out;
1446                 }
1447
1448                 if (func)
1449                         func(data, bh);
1450         }
1451
1452 out:
1453         /*
1454          * Catch any trailing bh that the loop didn't handle.
1455          */
1456         brelse(bh);
1457
1458         return ret;
1459 }
1460
1461 /*
1462  * Given an initialized path (that is, it has a valid root extent
1463  * list), this function will traverse the btree in search of the path
1464  * which would contain cpos.
1465  *
1466  * The path traveled is recorded in the path structure.
1467  *
1468  * Note that this will not do any comparisons on leaf node extent
1469  * records, so it will work fine in the case that we just added a tree
1470  * branch.
1471  */
1472 struct find_path_data {
1473         int index;
1474         struct ocfs2_path *path;
1475 };
1476 static void find_path_ins(void *data, struct buffer_head *bh)
1477 {
1478         struct find_path_data *fp = data;
1479
1480         get_bh(bh);
1481         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1482         fp->index++;
1483 }
1484 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1485                            u32 cpos)
1486 {
1487         struct find_path_data data;
1488
1489         data.index = 1;
1490         data.path = path;
1491         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1492                                  find_path_ins, &data);
1493 }
1494
1495 static void find_leaf_ins(void *data, struct buffer_head *bh)
1496 {
1497         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1498         struct ocfs2_extent_list *el = &eb->h_list;
1499         struct buffer_head **ret = data;
1500
1501         /* We want to retain only the leaf block. */
1502         if (le16_to_cpu(el->l_tree_depth) == 0) {
1503                 get_bh(bh);
1504                 *ret = bh;
1505         }
1506 }
1507 /*
1508  * Find the leaf block in the tree which would contain cpos. No
1509  * checking of the actual leaf is done.
1510  *
1511  * Some paths want to call this instead of allocating a path structure
1512  * and calling ocfs2_find_path().
1513  *
1514  * This function doesn't handle non btree extent lists.
1515  */
1516 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1517                     u32 cpos, struct buffer_head **leaf_bh)
1518 {
1519         int ret;
1520         struct buffer_head *bh = NULL;
1521
1522         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1523         if (ret) {
1524                 mlog_errno(ret);
1525                 goto out;
1526         }
1527
1528         *leaf_bh = bh;
1529 out:
1530         return ret;
1531 }
1532
1533 /*
1534  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1535  *
1536  * Basically, we've moved stuff around at the bottom of the tree and
1537  * we need to fix up the extent records above the changes to reflect
1538  * the new changes.
1539  *
1540  * left_rec: the record on the left.
1541  * left_child_el: is the child list pointed to by left_rec
1542  * right_rec: the record to the right of left_rec
1543  * right_child_el: is the child list pointed to by right_rec
1544  *
1545  * By definition, this only works on interior nodes.
1546  */
1547 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1548                                   struct ocfs2_extent_list *left_child_el,
1549                                   struct ocfs2_extent_rec *right_rec,
1550                                   struct ocfs2_extent_list *right_child_el)
1551 {
1552         u32 left_clusters, right_end;
1553
1554         /*
1555          * Interior nodes never have holes. Their cpos is the cpos of
1556          * the leftmost record in their child list. Their cluster
1557          * count covers the full theoretical range of their child list
1558          * - the range between their cpos and the cpos of the record
1559          * immediately to their right.
1560          */
1561         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1562         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1563                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1564                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1565         }
1566         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1567         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1568
1569         /*
1570          * Calculate the rightmost cluster count boundary before
1571          * moving cpos - we will need to adjust clusters after
1572          * updating e_cpos to keep the same highest cluster count.
1573          */
1574         right_end = le32_to_cpu(right_rec->e_cpos);
1575         right_end += le32_to_cpu(right_rec->e_int_clusters);
1576
1577         right_rec->e_cpos = left_rec->e_cpos;
1578         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1579
1580         right_end -= le32_to_cpu(right_rec->e_cpos);
1581         right_rec->e_int_clusters = cpu_to_le32(right_end);
1582 }
1583
1584 /*
1585  * Adjust the adjacent root node records involved in a
1586  * rotation. left_el_blkno is passed in as a key so that we can easily
1587  * find it's index in the root list.
1588  */
1589 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1590                                       struct ocfs2_extent_list *left_el,
1591                                       struct ocfs2_extent_list *right_el,
1592                                       u64 left_el_blkno)
1593 {
1594         int i;
1595
1596         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1597                le16_to_cpu(left_el->l_tree_depth));
1598
1599         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1600                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1601                         break;
1602         }
1603
1604         /*
1605          * The path walking code should have never returned a root and
1606          * two paths which are not adjacent.
1607          */
1608         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1609
1610         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1611                                       &root_el->l_recs[i + 1], right_el);
1612 }
1613
1614 /*
1615  * We've changed a leaf block (in right_path) and need to reflect that
1616  * change back up the subtree.
1617  *
1618  * This happens in multiple places:
1619  *   - When we've moved an extent record from the left path leaf to the right
1620  *     path leaf to make room for an empty extent in the left path leaf.
1621  *   - When our insert into the right path leaf is at the leftmost edge
1622  *     and requires an update of the path immediately to it's left. This
1623  *     can occur at the end of some types of rotation and appending inserts.
1624  *   - When we've adjusted the last extent record in the left path leaf and the
1625  *     1st extent record in the right path leaf during cross extent block merge.
1626  */
1627 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1628                                        struct ocfs2_path *left_path,
1629                                        struct ocfs2_path *right_path,
1630                                        int subtree_index)
1631 {
1632         int ret, i, idx;
1633         struct ocfs2_extent_list *el, *left_el, *right_el;
1634         struct ocfs2_extent_rec *left_rec, *right_rec;
1635         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1636
1637         /*
1638          * Update the counts and position values within all the
1639          * interior nodes to reflect the leaf rotation we just did.
1640          *
1641          * The root node is handled below the loop.
1642          *
1643          * We begin the loop with right_el and left_el pointing to the
1644          * leaf lists and work our way up.
1645          *
1646          * NOTE: within this loop, left_el and right_el always refer
1647          * to the *child* lists.
1648          */
1649         left_el = path_leaf_el(left_path);
1650         right_el = path_leaf_el(right_path);
1651         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1652                 mlog(0, "Adjust records at index %u\n", i);
1653
1654                 /*
1655                  * One nice property of knowing that all of these
1656                  * nodes are below the root is that we only deal with
1657                  * the leftmost right node record and the rightmost
1658                  * left node record.
1659                  */
1660                 el = left_path->p_node[i].el;
1661                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1662                 left_rec = &el->l_recs[idx];
1663
1664                 el = right_path->p_node[i].el;
1665                 right_rec = &el->l_recs[0];
1666
1667                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1668                                               right_el);
1669
1670                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1671                 if (ret)
1672                         mlog_errno(ret);
1673
1674                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1675                 if (ret)
1676                         mlog_errno(ret);
1677
1678                 /*
1679                  * Setup our list pointers now so that the current
1680                  * parents become children in the next iteration.
1681                  */
1682                 left_el = left_path->p_node[i].el;
1683                 right_el = right_path->p_node[i].el;
1684         }
1685
1686         /*
1687          * At the root node, adjust the two adjacent records which
1688          * begin our path to the leaves.
1689          */
1690
1691         el = left_path->p_node[subtree_index].el;
1692         left_el = left_path->p_node[subtree_index + 1].el;
1693         right_el = right_path->p_node[subtree_index + 1].el;
1694
1695         ocfs2_adjust_root_records(el, left_el, right_el,
1696                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1697
1698         root_bh = left_path->p_node[subtree_index].bh;
1699
1700         ret = ocfs2_journal_dirty(handle, root_bh);
1701         if (ret)
1702                 mlog_errno(ret);
1703 }
1704
1705 static int ocfs2_rotate_subtree_right(struct inode *inode,
1706                                       handle_t *handle,
1707                                       struct ocfs2_path *left_path,
1708                                       struct ocfs2_path *right_path,
1709                                       int subtree_index)
1710 {
1711         int ret, i;
1712         struct buffer_head *right_leaf_bh;
1713         struct buffer_head *left_leaf_bh = NULL;
1714         struct buffer_head *root_bh;
1715         struct ocfs2_extent_list *right_el, *left_el;
1716         struct ocfs2_extent_rec move_rec;
1717
1718         left_leaf_bh = path_leaf_bh(left_path);
1719         left_el = path_leaf_el(left_path);
1720
1721         if (left_el->l_next_free_rec != left_el->l_count) {
1722                 ocfs2_error(inode->i_sb,
1723                             "Inode %llu has non-full interior leaf node %llu"
1724                             "(next free = %u)",
1725                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1726                             (unsigned long long)left_leaf_bh->b_blocknr,
1727                             le16_to_cpu(left_el->l_next_free_rec));
1728                 return -EROFS;
1729         }
1730
1731         /*
1732          * This extent block may already have an empty record, so we
1733          * return early if so.
1734          */
1735         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1736                 return 0;
1737
1738         root_bh = left_path->p_node[subtree_index].bh;
1739         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1740
1741         ret = ocfs2_journal_access(handle, inode, root_bh,
1742                                    OCFS2_JOURNAL_ACCESS_WRITE);
1743         if (ret) {
1744                 mlog_errno(ret);
1745                 goto out;
1746         }
1747
1748         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1749                 ret = ocfs2_journal_access(handle, inode,
1750                                            right_path->p_node[i].bh,
1751                                            OCFS2_JOURNAL_ACCESS_WRITE);
1752                 if (ret) {
1753                         mlog_errno(ret);
1754                         goto out;
1755                 }
1756
1757                 ret = ocfs2_journal_access(handle, inode,
1758                                            left_path->p_node[i].bh,
1759                                            OCFS2_JOURNAL_ACCESS_WRITE);
1760                 if (ret) {
1761                         mlog_errno(ret);
1762                         goto out;
1763                 }
1764         }
1765
1766         right_leaf_bh = path_leaf_bh(right_path);
1767         right_el = path_leaf_el(right_path);
1768
1769         /* This is a code error, not a disk corruption. */
1770         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1771                         "because rightmost leaf block %llu is empty\n",
1772                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1773                         (unsigned long long)right_leaf_bh->b_blocknr);
1774
1775         ocfs2_create_empty_extent(right_el);
1776
1777         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1778         if (ret) {
1779                 mlog_errno(ret);
1780                 goto out;
1781         }
1782
1783         /* Do the copy now. */
1784         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1785         move_rec = left_el->l_recs[i];
1786         right_el->l_recs[0] = move_rec;
1787
1788         /*
1789          * Clear out the record we just copied and shift everything
1790          * over, leaving an empty extent in the left leaf.
1791          *
1792          * We temporarily subtract from next_free_rec so that the
1793          * shift will lose the tail record (which is now defunct).
1794          */
1795         le16_add_cpu(&left_el->l_next_free_rec, -1);
1796         ocfs2_shift_records_right(left_el);
1797         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1798         le16_add_cpu(&left_el->l_next_free_rec, 1);
1799
1800         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1801         if (ret) {
1802                 mlog_errno(ret);
1803                 goto out;
1804         }
1805
1806         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1807                                 subtree_index);
1808
1809 out:
1810         return ret;
1811 }
1812
1813 /*
1814  * Given a full path, determine what cpos value would return us a path
1815  * containing the leaf immediately to the left of the current one.
1816  *
1817  * Will return zero if the path passed in is already the leftmost path.
1818  */
1819 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1820                                          struct ocfs2_path *path, u32 *cpos)
1821 {
1822         int i, j, ret = 0;
1823         u64 blkno;
1824         struct ocfs2_extent_list *el;
1825
1826         BUG_ON(path->p_tree_depth == 0);
1827
1828         *cpos = 0;
1829
1830         blkno = path_leaf_bh(path)->b_blocknr;
1831
1832         /* Start at the tree node just above the leaf and work our way up. */
1833         i = path->p_tree_depth - 1;
1834         while (i >= 0) {
1835                 el = path->p_node[i].el;
1836
1837                 /*
1838                  * Find the extent record just before the one in our
1839                  * path.
1840                  */
1841                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1842                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1843                                 if (j == 0) {
1844                                         if (i == 0) {
1845                                                 /*
1846                                                  * We've determined that the
1847                                                  * path specified is already
1848                                                  * the leftmost one - return a
1849                                                  * cpos of zero.
1850                                                  */
1851                                                 goto out;
1852                                         }
1853                                         /*
1854                                          * The leftmost record points to our
1855                                          * leaf - we need to travel up the
1856                                          * tree one level.
1857                                          */
1858                                         goto next_node;
1859                                 }
1860
1861                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1862                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1863                                                            &el->l_recs[j - 1]);
1864                                 *cpos = *cpos - 1;
1865                                 goto out;
1866                         }
1867                 }
1868
1869                 /*
1870                  * If we got here, we never found a valid node where
1871                  * the tree indicated one should be.
1872                  */
1873                 ocfs2_error(sb,
1874                             "Invalid extent tree at extent block %llu\n",
1875                             (unsigned long long)blkno);
1876                 ret = -EROFS;
1877                 goto out;
1878
1879 next_node:
1880                 blkno = path->p_node[i].bh->b_blocknr;
1881                 i--;
1882         }
1883
1884 out:
1885         return ret;
1886 }
1887
1888 /*
1889  * Extend the transaction by enough credits to complete the rotation,
1890  * and still leave at least the original number of credits allocated
1891  * to this transaction.
1892  */
1893 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1894                                            int op_credits,
1895                                            struct ocfs2_path *path)
1896 {
1897         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1898
1899         if (handle->h_buffer_credits < credits)
1900                 return ocfs2_extend_trans(handle, credits);
1901
1902         return 0;
1903 }
1904
1905 /*
1906  * Trap the case where we're inserting into the theoretical range past
1907  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1908  * whose cpos is less than ours into the right leaf.
1909  *
1910  * It's only necessary to look at the rightmost record of the left
1911  * leaf because the logic that calls us should ensure that the
1912  * theoretical ranges in the path components above the leaves are
1913  * correct.
1914  */
1915 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1916                                                  u32 insert_cpos)
1917 {
1918         struct ocfs2_extent_list *left_el;
1919         struct ocfs2_extent_rec *rec;
1920         int next_free;
1921
1922         left_el = path_leaf_el(left_path);
1923         next_free = le16_to_cpu(left_el->l_next_free_rec);
1924         rec = &left_el->l_recs[next_free - 1];
1925
1926         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1927                 return 1;
1928         return 0;
1929 }
1930
1931 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1932 {
1933         int next_free = le16_to_cpu(el->l_next_free_rec);
1934         unsigned int range;
1935         struct ocfs2_extent_rec *rec;
1936
1937         if (next_free == 0)
1938                 return 0;
1939
1940         rec = &el->l_recs[0];
1941         if (ocfs2_is_empty_extent(rec)) {
1942                 /* Empty list. */
1943                 if (next_free == 1)
1944                         return 0;
1945                 rec = &el->l_recs[1];
1946         }
1947
1948         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1949         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1950                 return 1;
1951         return 0;
1952 }
1953
1954 /*
1955  * Rotate all the records in a btree right one record, starting at insert_cpos.
1956  *
1957  * The path to the rightmost leaf should be passed in.
1958  *
1959  * The array is assumed to be large enough to hold an entire path (tree depth).
1960  *
1961  * Upon succesful return from this function:
1962  *
1963  * - The 'right_path' array will contain a path to the leaf block
1964  *   whose range contains e_cpos.
1965  * - That leaf block will have a single empty extent in list index 0.
1966  * - In the case that the rotation requires a post-insert update,
1967  *   *ret_left_path will contain a valid path which can be passed to
1968  *   ocfs2_insert_path().
1969  */
1970 static int ocfs2_rotate_tree_right(struct inode *inode,
1971                                    handle_t *handle,
1972                                    enum ocfs2_split_type split,
1973                                    u32 insert_cpos,
1974                                    struct ocfs2_path *right_path,
1975                                    struct ocfs2_path **ret_left_path)
1976 {
1977         int ret, start, orig_credits = handle->h_buffer_credits;
1978         u32 cpos;
1979         struct ocfs2_path *left_path = NULL;
1980
1981         *ret_left_path = NULL;
1982
1983         left_path = ocfs2_new_path(path_root_bh(right_path),
1984                                    path_root_el(right_path));
1985         if (!left_path) {
1986                 ret = -ENOMEM;
1987                 mlog_errno(ret);
1988                 goto out;
1989         }
1990
1991         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1992         if (ret) {
1993                 mlog_errno(ret);
1994                 goto out;
1995         }
1996
1997         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1998
1999         /*
2000          * What we want to do here is:
2001          *
2002          * 1) Start with the rightmost path.
2003          *
2004          * 2) Determine a path to the leaf block directly to the left
2005          *    of that leaf.
2006          *
2007          * 3) Determine the 'subtree root' - the lowest level tree node
2008          *    which contains a path to both leaves.
2009          *
2010          * 4) Rotate the subtree.
2011          *
2012          * 5) Find the next subtree by considering the left path to be
2013          *    the new right path.
2014          *
2015          * The check at the top of this while loop also accepts
2016          * insert_cpos == cpos because cpos is only a _theoretical_
2017          * value to get us the left path - insert_cpos might very well
2018          * be filling that hole.
2019          *
2020          * Stop at a cpos of '0' because we either started at the
2021          * leftmost branch (i.e., a tree with one branch and a
2022          * rotation inside of it), or we've gone as far as we can in
2023          * rotating subtrees.
2024          */
2025         while (cpos && insert_cpos <= cpos) {
2026                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2027                      insert_cpos, cpos);
2028
2029                 ret = ocfs2_find_path(inode, left_path, cpos);
2030                 if (ret) {
2031                         mlog_errno(ret);
2032                         goto out;
2033                 }
2034
2035                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2036                                 path_leaf_bh(right_path),
2037                                 "Inode %lu: error during insert of %u "
2038                                 "(left path cpos %u) results in two identical "
2039                                 "paths ending at %llu\n",
2040                                 inode->i_ino, insert_cpos, cpos,
2041                                 (unsigned long long)
2042                                 path_leaf_bh(left_path)->b_blocknr);
2043
2044                 if (split == SPLIT_NONE &&
2045                     ocfs2_rotate_requires_path_adjustment(left_path,
2046                                                           insert_cpos)) {
2047
2048                         /*
2049                          * We've rotated the tree as much as we
2050                          * should. The rest is up to
2051                          * ocfs2_insert_path() to complete, after the
2052                          * record insertion. We indicate this
2053                          * situation by returning the left path.
2054                          *
2055                          * The reason we don't adjust the records here
2056                          * before the record insert is that an error
2057                          * later might break the rule where a parent
2058                          * record e_cpos will reflect the actual
2059                          * e_cpos of the 1st nonempty record of the
2060                          * child list.
2061                          */
2062                         *ret_left_path = left_path;
2063                         goto out_ret_path;
2064                 }
2065
2066                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2067
2068                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2069                      start,
2070                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2071                      right_path->p_tree_depth);
2072
2073                 ret = ocfs2_extend_rotate_transaction(handle, start,
2074                                                       orig_credits, right_path);
2075                 if (ret) {
2076                         mlog_errno(ret);
2077                         goto out;
2078                 }
2079
2080                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2081                                                  right_path, start);
2082                 if (ret) {
2083                         mlog_errno(ret);
2084                         goto out;
2085                 }
2086
2087                 if (split != SPLIT_NONE &&
2088                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2089                                                 insert_cpos)) {
2090                         /*
2091                          * A rotate moves the rightmost left leaf
2092                          * record over to the leftmost right leaf
2093                          * slot. If we're doing an extent split
2094                          * instead of a real insert, then we have to
2095                          * check that the extent to be split wasn't
2096                          * just moved over. If it was, then we can
2097                          * exit here, passing left_path back -
2098                          * ocfs2_split_extent() is smart enough to
2099                          * search both leaves.
2100                          */
2101                         *ret_left_path = left_path;
2102                         goto out_ret_path;
2103                 }
2104
2105                 /*
2106                  * There is no need to re-read the next right path
2107                  * as we know that it'll be our current left
2108                  * path. Optimize by copying values instead.
2109                  */
2110                 ocfs2_mv_path(right_path, left_path);
2111
2112                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2113                                                     &cpos);
2114                 if (ret) {
2115                         mlog_errno(ret);
2116                         goto out;
2117                 }
2118         }
2119
2120 out:
2121         ocfs2_free_path(left_path);
2122
2123 out_ret_path:
2124         return ret;
2125 }
2126
2127 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2128                                       struct ocfs2_path *path)
2129 {
2130         int i, idx;
2131         struct ocfs2_extent_rec *rec;
2132         struct ocfs2_extent_list *el;
2133         struct ocfs2_extent_block *eb;
2134         u32 range;
2135
2136         /* Path should always be rightmost. */
2137         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2138         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2139
2140         el = &eb->h_list;
2141         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2142         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2143         rec = &el->l_recs[idx];
2144         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2145
2146         for (i = 0; i < path->p_tree_depth; i++) {
2147                 el = path->p_node[i].el;
2148                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2149                 rec = &el->l_recs[idx];
2150
2151                 rec->e_int_clusters = cpu_to_le32(range);
2152                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2153
2154                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2155         }
2156 }
2157
2158 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2159                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2160                               struct ocfs2_path *path, int unlink_start)
2161 {
2162         int ret, i;
2163         struct ocfs2_extent_block *eb;
2164         struct ocfs2_extent_list *el;
2165         struct buffer_head *bh;
2166
2167         for(i = unlink_start; i < path_num_items(path); i++) {
2168                 bh = path->p_node[i].bh;
2169
2170                 eb = (struct ocfs2_extent_block *)bh->b_data;
2171                 /*
2172                  * Not all nodes might have had their final count
2173                  * decremented by the caller - handle this here.
2174                  */
2175                 el = &eb->h_list;
2176                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2177                         mlog(ML_ERROR,
2178                              "Inode %llu, attempted to remove extent block "
2179                              "%llu with %u records\n",
2180                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2181                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2182                              le16_to_cpu(el->l_next_free_rec));
2183
2184                         ocfs2_journal_dirty(handle, bh);
2185                         ocfs2_remove_from_cache(inode, bh);
2186                         continue;
2187                 }
2188
2189                 el->l_next_free_rec = 0;
2190                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2191
2192                 ocfs2_journal_dirty(handle, bh);
2193
2194                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2195                 if (ret)
2196                         mlog_errno(ret);
2197
2198                 ocfs2_remove_from_cache(inode, bh);
2199         }
2200 }
2201
2202 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2203                                  struct ocfs2_path *left_path,
2204                                  struct ocfs2_path *right_path,
2205                                  int subtree_index,
2206                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2207 {
2208         int i;
2209         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2210         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2211         struct ocfs2_extent_list *el;
2212         struct ocfs2_extent_block *eb;
2213
2214         el = path_leaf_el(left_path);
2215
2216         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2217
2218         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2219                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2220                         break;
2221
2222         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2223
2224         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2225         le16_add_cpu(&root_el->l_next_free_rec, -1);
2226
2227         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2228         eb->h_next_leaf_blk = 0;
2229
2230         ocfs2_journal_dirty(handle, root_bh);
2231         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2232
2233         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2234                           subtree_index + 1);
2235 }
2236
2237 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2238                                      struct ocfs2_path *left_path,
2239                                      struct ocfs2_path *right_path,
2240                                      int subtree_index,
2241                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2242                                      int *deleted,
2243                                      struct ocfs2_extent_tree *et)
2244 {
2245         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2246         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2247         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2248         struct ocfs2_extent_block *eb;
2249
2250         *deleted = 0;
2251
2252         right_leaf_el = path_leaf_el(right_path);
2253         left_leaf_el = path_leaf_el(left_path);
2254         root_bh = left_path->p_node[subtree_index].bh;
2255         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2256
2257         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2258                 return 0;
2259
2260         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2261         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2262                 /*
2263                  * It's legal for us to proceed if the right leaf is
2264                  * the rightmost one and it has an empty extent. There
2265                  * are two cases to handle - whether the leaf will be
2266                  * empty after removal or not. If the leaf isn't empty
2267                  * then just remove the empty extent up front. The
2268                  * next block will handle empty leaves by flagging
2269                  * them for unlink.
2270                  *
2271                  * Non rightmost leaves will throw -EAGAIN and the
2272                  * caller can manually move the subtree and retry.
2273                  */
2274
2275                 if (eb->h_next_leaf_blk != 0ULL)
2276                         return -EAGAIN;
2277
2278                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2279                         ret = ocfs2_journal_access(handle, inode,
2280                                                    path_leaf_bh(right_path),
2281                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2282                         if (ret) {
2283                                 mlog_errno(ret);
2284                                 goto out;
2285                         }
2286
2287                         ocfs2_remove_empty_extent(right_leaf_el);
2288                 } else
2289                         right_has_empty = 1;
2290         }
2291
2292         if (eb->h_next_leaf_blk == 0ULL &&
2293             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2294                 /*
2295                  * We have to update i_last_eb_blk during the meta
2296                  * data delete.
2297                  */
2298                 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2299                                            OCFS2_JOURNAL_ACCESS_WRITE);
2300                 if (ret) {
2301                         mlog_errno(ret);
2302                         goto out;
2303                 }
2304
2305                 del_right_subtree = 1;
2306         }
2307
2308         /*
2309          * Getting here with an empty extent in the right path implies
2310          * that it's the rightmost path and will be deleted.
2311          */
2312         BUG_ON(right_has_empty && !del_right_subtree);
2313
2314         ret = ocfs2_journal_access(handle, inode, root_bh,
2315                                    OCFS2_JOURNAL_ACCESS_WRITE);
2316         if (ret) {
2317                 mlog_errno(ret);
2318                 goto out;
2319         }
2320
2321         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2322                 ret = ocfs2_journal_access(handle, inode,
2323                                            right_path->p_node[i].bh,
2324                                            OCFS2_JOURNAL_ACCESS_WRITE);
2325                 if (ret) {
2326                         mlog_errno(ret);
2327                         goto out;
2328                 }
2329
2330                 ret = ocfs2_journal_access(handle, inode,
2331                                            left_path->p_node[i].bh,
2332                                            OCFS2_JOURNAL_ACCESS_WRITE);
2333                 if (ret) {
2334                         mlog_errno(ret);
2335                         goto out;
2336                 }
2337         }
2338
2339         if (!right_has_empty) {
2340                 /*
2341                  * Only do this if we're moving a real
2342                  * record. Otherwise, the action is delayed until
2343                  * after removal of the right path in which case we
2344                  * can do a simple shift to remove the empty extent.
2345                  */
2346                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2347                 memset(&right_leaf_el->l_recs[0], 0,
2348                        sizeof(struct ocfs2_extent_rec));
2349         }
2350         if (eb->h_next_leaf_blk == 0ULL) {
2351                 /*
2352                  * Move recs over to get rid of empty extent, decrease
2353                  * next_free. This is allowed to remove the last
2354                  * extent in our leaf (setting l_next_free_rec to
2355                  * zero) - the delete code below won't care.
2356                  */
2357                 ocfs2_remove_empty_extent(right_leaf_el);
2358         }
2359
2360         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2361         if (ret)
2362                 mlog_errno(ret);
2363         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2364         if (ret)
2365                 mlog_errno(ret);
2366
2367         if (del_right_subtree) {
2368                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2369                                      subtree_index, dealloc);
2370                 ocfs2_update_edge_lengths(inode, handle, left_path);
2371
2372                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2373                 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2374
2375                 /*
2376                  * Removal of the extent in the left leaf was skipped
2377                  * above so we could delete the right path
2378                  * 1st.
2379                  */
2380                 if (right_has_empty)
2381                         ocfs2_remove_empty_extent(left_leaf_el);
2382
2383                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2384                 if (ret)
2385                         mlog_errno(ret);
2386
2387                 *deleted = 1;
2388         } else
2389                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2390                                            subtree_index);
2391
2392 out:
2393         return ret;
2394 }
2395
2396 /*
2397  * Given a full path, determine what cpos value would return us a path
2398  * containing the leaf immediately to the right of the current one.
2399  *
2400  * Will return zero if the path passed in is already the rightmost path.
2401  *
2402  * This looks similar, but is subtly different to
2403  * ocfs2_find_cpos_for_left_leaf().
2404  */
2405 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2406                                           struct ocfs2_path *path, u32 *cpos)
2407 {
2408         int i, j, ret = 0;
2409         u64 blkno;
2410         struct ocfs2_extent_list *el;
2411
2412         *cpos = 0;
2413
2414         if (path->p_tree_depth == 0)
2415                 return 0;
2416
2417         blkno = path_leaf_bh(path)->b_blocknr;
2418
2419         /* Start at the tree node just above the leaf and work our way up. */
2420         i = path->p_tree_depth - 1;
2421         while (i >= 0) {
2422                 int next_free;
2423
2424                 el = path->p_node[i].el;
2425
2426                 /*
2427                  * Find the extent record just after the one in our
2428                  * path.
2429                  */
2430                 next_free = le16_to_cpu(el->l_next_free_rec);
2431                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2432                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2433                                 if (j == (next_free - 1)) {
2434                                         if (i == 0) {
2435                                                 /*
2436                                                  * We've determined that the
2437                                                  * path specified is already
2438                                                  * the rightmost one - return a
2439                                                  * cpos of zero.
2440                                                  */
2441                                                 goto out;
2442                                         }
2443                                         /*
2444                                          * The rightmost record points to our
2445                                          * leaf - we need to travel up the
2446                                          * tree one level.
2447                                          */
2448                                         goto next_node;
2449                                 }
2450
2451                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2452                                 goto out;
2453                         }
2454                 }
2455
2456                 /*
2457                  * If we got here, we never found a valid node where
2458                  * the tree indicated one should be.
2459                  */
2460                 ocfs2_error(sb,
2461                             "Invalid extent tree at extent block %llu\n",
2462                             (unsigned long long)blkno);
2463                 ret = -EROFS;
2464                 goto out;
2465
2466 next_node:
2467                 blkno = path->p_node[i].bh->b_blocknr;
2468                 i--;
2469         }
2470
2471 out:
2472         return ret;
2473 }
2474
2475 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2476                                             handle_t *handle,
2477                                             struct buffer_head *bh,
2478                                             struct ocfs2_extent_list *el)
2479 {
2480         int ret;
2481
2482         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2483                 return 0;
2484
2485         ret = ocfs2_journal_access(handle, inode, bh,
2486                                    OCFS2_JOURNAL_ACCESS_WRITE);
2487         if (ret) {
2488                 mlog_errno(ret);
2489                 goto out;
2490         }
2491
2492         ocfs2_remove_empty_extent(el);
2493
2494         ret = ocfs2_journal_dirty(handle, bh);
2495         if (ret)
2496                 mlog_errno(ret);
2497
2498 out:
2499         return ret;
2500 }
2501
2502 static int __ocfs2_rotate_tree_left(struct inode *inode,
2503                                     handle_t *handle, int orig_credits,
2504                                     struct ocfs2_path *path,
2505                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2506                                     struct ocfs2_path **empty_extent_path,
2507                                     struct ocfs2_extent_tree *et)
2508 {
2509         int ret, subtree_root, deleted;
2510         u32 right_cpos;
2511         struct ocfs2_path *left_path = NULL;
2512         struct ocfs2_path *right_path = NULL;
2513
2514         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2515
2516         *empty_extent_path = NULL;
2517
2518         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2519                                              &right_cpos);
2520         if (ret) {
2521                 mlog_errno(ret);
2522                 goto out;
2523         }
2524
2525         left_path = ocfs2_new_path(path_root_bh(path),
2526                                    path_root_el(path));
2527         if (!left_path) {
2528                 ret = -ENOMEM;
2529                 mlog_errno(ret);
2530                 goto out;
2531         }
2532
2533         ocfs2_cp_path(left_path, path);
2534
2535         right_path = ocfs2_new_path(path_root_bh(path),
2536                                     path_root_el(path));
2537         if (!right_path) {
2538                 ret = -ENOMEM;
2539                 mlog_errno(ret);
2540                 goto out;
2541         }
2542
2543         while (right_cpos) {
2544                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2545                 if (ret) {
2546                         mlog_errno(ret);
2547                         goto out;
2548                 }
2549
2550                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2551                                                        right_path);
2552
2553                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2554                      subtree_root,
2555                      (unsigned long long)
2556                      right_path->p_node[subtree_root].bh->b_blocknr,
2557                      right_path->p_tree_depth);
2558
2559                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2560                                                       orig_credits, left_path);
2561                 if (ret) {
2562                         mlog_errno(ret);
2563                         goto out;
2564                 }
2565
2566                 /*
2567                  * Caller might still want to make changes to the
2568                  * tree root, so re-add it to the journal here.
2569                  */
2570                 ret = ocfs2_journal_access(handle, inode,
2571                                            path_root_bh(left_path),
2572                                            OCFS2_JOURNAL_ACCESS_WRITE);
2573                 if (ret) {
2574                         mlog_errno(ret);
2575                         goto out;
2576                 }
2577
2578                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2579                                                 right_path, subtree_root,
2580                                                 dealloc, &deleted, et);
2581                 if (ret == -EAGAIN) {
2582                         /*
2583                          * The rotation has to temporarily stop due to
2584                          * the right subtree having an empty
2585                          * extent. Pass it back to the caller for a
2586                          * fixup.
2587                          */
2588                         *empty_extent_path = right_path;
2589                         right_path = NULL;
2590                         goto out;
2591                 }
2592                 if (ret) {
2593                         mlog_errno(ret);
2594                         goto out;
2595                 }
2596
2597                 /*
2598                  * The subtree rotate might have removed records on
2599                  * the rightmost edge. If so, then rotation is
2600                  * complete.
2601                  */
2602                 if (deleted)
2603                         break;
2604
2605                 ocfs2_mv_path(left_path, right_path);
2606
2607                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2608                                                      &right_cpos);
2609                 if (ret) {
2610                         mlog_errno(ret);
2611                         goto out;
2612                 }
2613         }
2614
2615 out:
2616         ocfs2_free_path(right_path);
2617         ocfs2_free_path(left_path);
2618
2619         return ret;
2620 }
2621
2622 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2623                                 struct ocfs2_path *path,
2624                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2625                                 struct ocfs2_extent_tree *et)
2626 {
2627         int ret, subtree_index;
2628         u32 cpos;
2629         struct ocfs2_path *left_path = NULL;
2630         struct ocfs2_extent_block *eb;
2631         struct ocfs2_extent_list *el;
2632
2633
2634         ret = et->eops->sanity_check(inode, et);
2635         if (ret)
2636                 goto out;
2637         /*
2638          * There's two ways we handle this depending on
2639          * whether path is the only existing one.
2640          */
2641         ret = ocfs2_extend_rotate_transaction(handle, 0,
2642                                               handle->h_buffer_credits,
2643                                               path);
2644         if (ret) {
2645                 mlog_errno(ret);
2646                 goto out;
2647         }
2648
2649         ret = ocfs2_journal_access_path(inode, handle, path);
2650         if (ret) {
2651                 mlog_errno(ret);
2652                 goto out;
2653         }
2654
2655         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2656         if (ret) {
2657                 mlog_errno(ret);
2658                 goto out;
2659         }
2660
2661         if (cpos) {
2662                 /*
2663                  * We have a path to the left of this one - it needs
2664                  * an update too.
2665                  */
2666                 left_path = ocfs2_new_path(path_root_bh(path),
2667                                            path_root_el(path));
2668                 if (!left_path) {
2669                         ret = -ENOMEM;
2670                         mlog_errno(ret);
2671                         goto out;
2672                 }
2673
2674                 ret = ocfs2_find_path(inode, left_path, cpos);
2675                 if (ret) {
2676                         mlog_errno(ret);
2677                         goto out;
2678                 }
2679
2680                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2681                 if (ret) {
2682                         mlog_errno(ret);
2683                         goto out;
2684                 }
2685
2686                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2687
2688                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2689                                      subtree_index, dealloc);
2690                 ocfs2_update_edge_lengths(inode, handle, left_path);
2691
2692                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2693                 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2694         } else {
2695                 /*
2696                  * 'path' is also the leftmost path which
2697                  * means it must be the only one. This gets
2698                  * handled differently because we want to
2699                  * revert the inode back to having extents
2700                  * in-line.
2701                  */
2702                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2703
2704                 el = et->root_el;
2705                 el->l_tree_depth = 0;
2706                 el->l_next_free_rec = 0;
2707                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2708
2709                 ocfs2_set_last_eb_blk(et, 0);
2710         }
2711
2712         ocfs2_journal_dirty(handle, path_root_bh(path));
2713
2714 out:
2715         ocfs2_free_path(left_path);
2716         return ret;
2717 }
2718
2719 /*
2720  * Left rotation of btree records.
2721  *
2722  * In many ways, this is (unsurprisingly) the opposite of right
2723  * rotation. We start at some non-rightmost path containing an empty
2724  * extent in the leaf block. The code works its way to the rightmost
2725  * path by rotating records to the left in every subtree.
2726  *
2727  * This is used by any code which reduces the number of extent records
2728  * in a leaf. After removal, an empty record should be placed in the
2729  * leftmost list position.
2730  *
2731  * This won't handle a length update of the rightmost path records if
2732  * the rightmost tree leaf record is removed so the caller is
2733  * responsible for detecting and correcting that.
2734  */
2735 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2736                                   struct ocfs2_path *path,
2737                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2738                                   struct ocfs2_extent_tree *et)
2739 {
2740         int ret, orig_credits = handle->h_buffer_credits;
2741         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2742         struct ocfs2_extent_block *eb;
2743         struct ocfs2_extent_list *el;
2744
2745         el = path_leaf_el(path);
2746         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2747                 return 0;
2748
2749         if (path->p_tree_depth == 0) {
2750 rightmost_no_delete:
2751                 /*
2752                  * Inline extents. This is trivially handled, so do
2753                  * it up front.
2754                  */
2755                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2756                                                        path_leaf_bh(path),
2757                                                        path_leaf_el(path));
2758                 if (ret)
2759                         mlog_errno(ret);
2760                 goto out;
2761         }
2762
2763         /*
2764          * Handle rightmost branch now. There's several cases:
2765          *  1) simple rotation leaving records in there. That's trivial.
2766          *  2) rotation requiring a branch delete - there's no more
2767          *     records left. Two cases of this:
2768          *     a) There are branches to the left.
2769          *     b) This is also the leftmost (the only) branch.
2770          *
2771          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2772          *  2a) we need the left branch so that we can update it with the unlink
2773          *  2b) we need to bring the inode back to inline extents.
2774          */
2775
2776         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2777         el = &eb->h_list;
2778         if (eb->h_next_leaf_blk == 0) {
2779                 /*
2780                  * This gets a bit tricky if we're going to delete the
2781                  * rightmost path. Get the other cases out of the way
2782                  * 1st.
2783                  */
2784                 if (le16_to_cpu(el->l_next_free_rec) > 1)
2785                         goto rightmost_no_delete;
2786
2787                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2788                         ret = -EIO;
2789                         ocfs2_error(inode->i_sb,
2790                                     "Inode %llu has empty extent block at %llu",
2791                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2792                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
2793                         goto out;
2794                 }
2795
2796                 /*
2797                  * XXX: The caller can not trust "path" any more after
2798                  * this as it will have been deleted. What do we do?
2799                  *
2800                  * In theory the rotate-for-merge code will never get
2801                  * here because it'll always ask for a rotate in a
2802                  * nonempty list.
2803                  */
2804
2805                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2806                                                   dealloc, et);
2807                 if (ret)
2808                         mlog_errno(ret);
2809                 goto out;
2810         }
2811
2812         /*
2813          * Now we can loop, remembering the path we get from -EAGAIN
2814          * and restarting from there.
2815          */
2816 try_rotate:
2817         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2818                                        dealloc, &restart_path, et);
2819         if (ret && ret != -EAGAIN) {
2820                 mlog_errno(ret);
2821                 goto out;
2822         }
2823
2824         while (ret == -EAGAIN) {
2825                 tmp_path = restart_path;
2826                 restart_path = NULL;
2827
2828                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2829                                                tmp_path, dealloc,
2830                                                &restart_path, et);
2831                 if (ret && ret != -EAGAIN) {
2832                         mlog_errno(ret);
2833                         goto out;
2834                 }
2835
2836                 ocfs2_free_path(tmp_path);
2837                 tmp_path = NULL;
2838
2839                 if (ret == 0)
2840                         goto try_rotate;
2841         }
2842
2843 out:
2844         ocfs2_free_path(tmp_path);
2845         ocfs2_free_path(restart_path);
2846         return ret;
2847 }
2848
2849 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2850                                 int index)
2851 {
2852         struct ocfs2_extent_rec *rec = &el->l_recs[index];
2853         unsigned int size;
2854
2855         if (rec->e_leaf_clusters == 0) {
2856                 /*
2857                  * We consumed all of the merged-from record. An empty
2858                  * extent cannot exist anywhere but the 1st array
2859                  * position, so move things over if the merged-from
2860                  * record doesn't occupy that position.
2861                  *
2862                  * This creates a new empty extent so the caller
2863                  * should be smart enough to have removed any existing
2864                  * ones.
2865                  */
2866                 if (index > 0) {
2867                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2868                         size = index * sizeof(struct ocfs2_extent_rec);
2869                         memmove(&el->l_recs[1], &el->l_recs[0], size);
2870                 }
2871
2872                 /*
2873                  * Always memset - the caller doesn't check whether it
2874                  * created an empty extent, so there could be junk in
2875                  * the other fields.
2876                  */
2877                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2878         }
2879 }
2880
2881 static int ocfs2_get_right_path(struct inode *inode,
2882                                 struct ocfs2_path *left_path,
2883                                 struct ocfs2_path **ret_right_path)
2884 {
2885         int ret;
2886         u32 right_cpos;
2887         struct ocfs2_path *right_path = NULL;
2888         struct ocfs2_extent_list *left_el;
2889
2890         *ret_right_path = NULL;
2891
2892         /* This function shouldn't be called for non-trees. */
2893         BUG_ON(left_path->p_tree_depth == 0);
2894
2895         left_el = path_leaf_el(left_path);
2896         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2897
2898         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2899                                              &right_cpos);
2900         if (ret) {
2901                 mlog_errno(ret);
2902                 goto out;
2903         }
2904
2905         /* This function shouldn't be called for the rightmost leaf. */
2906         BUG_ON(right_cpos == 0);
2907
2908         right_path = ocfs2_new_path(path_root_bh(left_path),
2909                                     path_root_el(left_path));
2910         if (!right_path) {
2911                 ret = -ENOMEM;
2912                 mlog_errno(ret);
2913                 goto out;
2914         }
2915
2916         ret = ocfs2_find_path(inode, right_path, right_cpos);
2917         if (ret) {
2918                 mlog_errno(ret);
2919                 goto out;
2920         }
2921
2922         *ret_right_path = right_path;
2923 out:
2924         if (ret)
2925                 ocfs2_free_path(right_path);
2926         return ret;
2927 }
2928
2929 /*
2930  * Remove split_rec clusters from the record at index and merge them
2931  * onto the beginning of the record "next" to it.
2932  * For index < l_count - 1, the next means the extent rec at index + 1.
2933  * For index == l_count - 1, the "next" means the 1st extent rec of the
2934  * next extent block.
2935  */
2936 static int ocfs2_merge_rec_right(struct inode *inode,
2937                                  struct ocfs2_path *left_path,
2938                                  handle_t *handle,
2939                                  struct ocfs2_extent_rec *split_rec,
2940                                  int index)
2941 {
2942         int ret, next_free, i;
2943         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2944         struct ocfs2_extent_rec *left_rec;
2945         struct ocfs2_extent_rec *right_rec;
2946         struct ocfs2_extent_list *right_el;
2947         struct ocfs2_path *right_path = NULL;
2948         int subtree_index = 0;
2949         struct ocfs2_extent_list *el = path_leaf_el(left_path);
2950         struct buffer_head *bh = path_leaf_bh(left_path);
2951         struct buffer_head *root_bh = NULL;
2952
2953         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2954         left_rec = &el->l_recs[index];
2955
2956         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
2957             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
2958                 /* we meet with a cross extent block merge. */
2959                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
2960                 if (ret) {
2961                         mlog_errno(ret);
2962                         goto out;
2963                 }
2964
2965                 right_el = path_leaf_el(right_path);
2966                 next_free = le16_to_cpu(right_el->l_next_free_rec);
2967                 BUG_ON(next_free <= 0);
2968                 right_rec = &right_el->l_recs[0];
2969                 if (ocfs2_is_empty_extent(right_rec)) {
2970                         BUG_ON(next_free <= 1);
2971                         right_rec = &right_el->l_recs[1];
2972                 }
2973
2974                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2975                        le16_to_cpu(left_rec->e_leaf_clusters) !=
2976                        le32_to_cpu(right_rec->e_cpos));
2977
2978                 subtree_index = ocfs2_find_subtree_root(inode,
2979                                                         left_path, right_path);
2980
2981                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2982                                                       handle->h_buffer_credits,
2983                                                       right_path);
2984                 if (ret) {
2985                         mlog_errno(ret);
2986                         goto out;
2987                 }
2988
2989                 root_bh = left_path->p_node[subtree_index].bh;
2990                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2991
2992                 ret = ocfs2_journal_access(handle, inode, root_bh,
2993                                            OCFS2_JOURNAL_ACCESS_WRITE);
2994                 if (ret) {
2995                         mlog_errno(ret);
2996                         goto out;
2997                 }
2998
2999                 for (i = subtree_index + 1;
3000                      i < path_num_items(right_path); i++) {
3001                         ret = ocfs2_journal_access(handle, inode,
3002                                                    right_path->p_node[i].bh,
3003                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3004                         if (ret) {
3005                                 mlog_errno(ret);
3006                                 goto out;
3007                         }
3008
3009                         ret = ocfs2_journal_access(handle, inode,
3010                                                    left_path->p_node[i].bh,
3011                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3012                         if (ret) {
3013                                 mlog_errno(ret);
3014                                 goto out;
3015                         }
3016                 }
3017
3018         } else {
3019                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3020                 right_rec = &el->l_recs[index + 1];
3021         }
3022
3023         ret = ocfs2_journal_access(handle, inode, bh,
3024                                    OCFS2_JOURNAL_ACCESS_WRITE);
3025         if (ret) {
3026                 mlog_errno(ret);
3027                 goto out;
3028         }
3029
3030         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3031
3032         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3033         le64_add_cpu(&right_rec->e_blkno,
3034                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3035         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3036
3037         ocfs2_cleanup_merge(el, index);
3038
3039         ret = ocfs2_journal_dirty(handle, bh);
3040         if (ret)
3041                 mlog_errno(ret);
3042
3043         if (right_path) {
3044                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3045                 if (ret)
3046                         mlog_errno(ret);
3047
3048                 ocfs2_complete_edge_insert(inode, handle, left_path,
3049                                            right_path, subtree_index);
3050         }
3051 out:
3052         if (right_path)
3053                 ocfs2_free_path(right_path);
3054         return ret;
3055 }
3056
3057 static int ocfs2_get_left_path(struct inode *inode,
3058                                struct ocfs2_path *right_path,
3059                                struct ocfs2_path **ret_left_path)
3060 {
3061         int ret;
3062         u32 left_cpos;
3063         struct ocfs2_path *left_path = NULL;
3064
3065         *ret_left_path = NULL;
3066
3067         /* This function shouldn't be called for non-trees. */
3068         BUG_ON(right_path->p_tree_depth == 0);
3069
3070         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3071                                             right_path, &left_cpos);
3072         if (ret) {
3073                 mlog_errno(ret);
3074                 goto out;
3075         }
3076
3077         /* This function shouldn't be called for the leftmost leaf. */
3078         BUG_ON(left_cpos == 0);
3079
3080         left_path = ocfs2_new_path(path_root_bh(right_path),
3081                                    path_root_el(right_path));
3082         if (!left_path) {
3083                 ret = -ENOMEM;
3084                 mlog_errno(ret);
3085                 goto out;
3086         }
3087
3088         ret = ocfs2_find_path(inode, left_path, left_cpos);
3089         if (ret) {
3090                 mlog_errno(ret);
3091                 goto out;
3092         }
3093
3094         *ret_left_path = left_path;
3095 out:
3096         if (ret)
3097                 ocfs2_free_path(left_path);
3098         return ret;
3099 }
3100
3101 /*
3102  * Remove split_rec clusters from the record at index and merge them
3103  * onto the tail of the record "before" it.
3104  * For index > 0, the "before" means the extent rec at index - 1.
3105  *
3106  * For index == 0, the "before" means the last record of the previous
3107  * extent block. And there is also a situation that we may need to
3108  * remove the rightmost leaf extent block in the right_path and change
3109  * the right path to indicate the new rightmost path.
3110  */
3111 static int ocfs2_merge_rec_left(struct inode *inode,
3112                                 struct ocfs2_path *right_path,
3113                                 handle_t *handle,
3114                                 struct ocfs2_extent_rec *split_rec,
3115                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3116                                 struct ocfs2_extent_tree *et,
3117                                 int index)
3118 {
3119         int ret, i, subtree_index = 0, has_empty_extent = 0;
3120         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3121         struct ocfs2_extent_rec *left_rec;
3122         struct ocfs2_extent_rec *right_rec;
3123         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3124         struct buffer_head *bh = path_leaf_bh(right_path);
3125         struct buffer_head *root_bh = NULL;
3126         struct ocfs2_path *left_path = NULL;
3127         struct ocfs2_extent_list *left_el;
3128
3129         BUG_ON(index < 0);
3130
3131         right_rec = &el->l_recs[index];
3132         if (index == 0) {
3133                 /* we meet with a cross extent block merge. */
3134                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3135                 if (ret) {
3136                         mlog_errno(ret);
3137                         goto out;
3138                 }
3139
3140                 left_el = path_leaf_el(left_path);
3141                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3142                        le16_to_cpu(left_el->l_count));
3143
3144                 left_rec = &left_el->l_recs[
3145                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3146                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3147                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3148                        le32_to_cpu(split_rec->e_cpos));
3149
3150                 subtree_index = ocfs2_find_subtree_root(inode,
3151                                                         left_path, right_path);
3152
3153                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3154                                                       handle->h_buffer_credits,
3155                                                       left_path);
3156                 if (ret) {
3157                         mlog_errno(ret);
3158                         goto out;
3159                 }
3160
3161                 root_bh = left_path->p_node[subtree_index].bh;
3162                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3163
3164                 ret = ocfs2_journal_access(handle, inode, root_bh,
3165                                            OCFS2_JOURNAL_ACCESS_WRITE);
3166                 if (ret) {
3167                         mlog_errno(ret);
3168                         goto out;
3169                 }
3170
3171                 for (i = subtree_index + 1;
3172                      i < path_num_items(right_path); i++) {
3173                         ret = ocfs2_journal_access(handle, inode,
3174                                                    right_path->p_node[i].bh,
3175                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3176                         if (ret) {
3177                                 mlog_errno(ret);
3178                                 goto out;
3179                         }
3180
3181                         ret = ocfs2_journal_access(handle, inode,
3182                                                    left_path->p_node[i].bh,
3183                                                    OCFS2_JOURNAL_ACCESS_WRITE);
3184                         if (ret) {
3185                                 mlog_errno(ret);
3186                                 goto out;
3187                         }
3188                 }
3189         } else {
3190                 left_rec = &el->l_recs[index - 1];
3191                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3192                         has_empty_extent = 1;
3193         }
3194
3195         ret = ocfs2_journal_access(handle, inode, bh,
3196                                    OCFS2_JOURNAL_ACCESS_WRITE);
3197         if (ret) {
3198                 mlog_errno(ret);
3199                 goto out;
3200         }
3201
3202         if (has_empty_extent && index == 1) {
3203                 /*
3204                  * The easy case - we can just plop the record right in.
3205                  */
3206                 *left_rec = *split_rec;
3207
3208                 has_empty_extent = 0;
3209         } else
3210                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3211
3212         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3213         le64_add_cpu(&right_rec->e_blkno,
3214                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3215         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3216
3217         ocfs2_cleanup_merge(el, index);
3218
3219         ret = ocfs2_journal_dirty(handle, bh);
3220         if (ret)
3221                 mlog_errno(ret);
3222
3223         if (left_path) {
3224                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3225                 if (ret)
3226                         mlog_errno(ret);
3227
3228                 /*
3229                  * In the situation that the right_rec is empty and the extent
3230                  * block is empty also,  ocfs2_complete_edge_insert can't handle
3231                  * it and we need to delete the right extent block.
3232                  */
3233                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3234                     le16_to_cpu(el->l_next_free_rec) == 1) {
3235
3236                         ret = ocfs2_remove_rightmost_path(inode, handle,
3237                                                           right_path,
3238                                                           dealloc, et);
3239                         if (ret) {
3240                                 mlog_errno(ret);
3241                                 goto out;
3242                         }
3243
3244                         /* Now the rightmost extent block has been deleted.
3245                          * So we use the new rightmost path.
3246                          */
3247                         ocfs2_mv_path(right_path, left_path);
3248                         left_path = NULL;
3249                 } else
3250                         ocfs2_complete_edge_insert(inode, handle, left_path,
3251                                                    right_path, subtree_index);
3252         }
3253 out:
3254         if (left_path)
3255                 ocfs2_free_path(left_path);
3256         return ret;
3257 }
3258
3259 static int ocfs2_try_to_merge_extent(struct inode *inode,
3260                                      handle_t *handle,
3261                                      struct ocfs2_path *path,
3262                                      int split_index,
3263                                      struct ocfs2_extent_rec *split_rec,
3264                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3265                                      struct ocfs2_merge_ctxt *ctxt,
3266                                      struct ocfs2_extent_tree *et)
3267
3268 {
3269         int ret = 0;
3270         struct ocfs2_extent_list *el = path_leaf_el(path);
3271         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3272
3273         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3274
3275         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3276                 /*
3277                  * The merge code will need to create an empty
3278                  * extent to take the place of the newly
3279                  * emptied slot. Remove any pre-existing empty
3280                  * extents - having more than one in a leaf is
3281                  * illegal.
3282                  */
3283                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3284                                              dealloc, et);
3285                 if (ret) {
3286                         mlog_errno(ret);
3287                         goto out;
3288                 }
3289                 split_index--;
3290                 rec = &el->l_recs[split_index];
3291         }
3292
3293         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3294                 /*
3295                  * Left-right contig implies this.
3296                  */
3297                 BUG_ON(!ctxt->c_split_covers_rec);
3298
3299                 /*
3300                  * Since the leftright insert always covers the entire
3301                  * extent, this call will delete the insert record
3302                  * entirely, resulting in an empty extent record added to
3303                  * the extent block.
3304                  *
3305                  * Since the adding of an empty extent shifts
3306                  * everything back to the right, there's no need to
3307                  * update split_index here.
3308                  *
3309                  * When the split_index is zero, we need to merge it to the
3310                  * prevoius extent block. It is more efficient and easier
3311                  * if we do merge_right first and merge_left later.
3312                  */
3313                 ret = ocfs2_merge_rec_right(inode, path,
3314                                             handle, split_rec,
3315                                             split_index);
3316                 if (ret) {
3317                         mlog_errno(ret);
3318                         goto out;
3319                 }
3320
3321                 /*
3322                  * We can only get this from logic error above.
3323                  */
3324                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3325
3326                 /* The merge left us with an empty extent, remove it. */
3327                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3328                                              dealloc, et);
3329                 if (ret) {
3330                         mlog_errno(ret);
3331                         goto out;
3332                 }
3333
3334                 rec = &el->l_recs[split_index];
3335
3336                 /*
3337                  * Note that we don't pass split_rec here on purpose -
3338                  * we've merged it into the rec already.
3339                  */
3340                 ret = ocfs2_merge_rec_left(inode, path,
3341                                            handle, rec,
3342                                            dealloc, et,
3343                                            split_index);
3344
3345                 if (ret) {
3346                         mlog_errno(ret);
3347                         goto out;
3348                 }
3349
3350                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3351                                              dealloc, et);
3352                 /*
3353                  * Error from this last rotate is not critical, so
3354                  * print but don't bubble it up.
3355                  */
3356                 if (ret)
3357                         mlog_errno(ret);
3358                 ret = 0;
3359         } else {
3360                 /*
3361                  * Merge a record to the left or right.
3362                  *
3363                  * 'contig_type' is relative to the existing record,
3364                  * so for example, if we're "right contig", it's to
3365                  * the record on the left (hence the left merge).
3366                  */
3367                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3368                         ret = ocfs2_merge_rec_left(inode,
3369                                                    path,
3370                                                    handle, split_rec,
3371                                                    dealloc, et,
3372                                                    split_index);
3373                         if (ret) {
3374                                 mlog_errno(ret);
3375                                 goto out;
3376                         }
3377                 } else {
3378                         ret = ocfs2_merge_rec_right(inode,
3379                                                     path,
3380                                                     handle, split_rec,
3381                                                     split_index);
3382                         if (ret) {
3383                                 mlog_errno(ret);
3384                                 goto out;
3385                         }
3386                 }
3387
3388                 if (ctxt->c_split_covers_rec) {
3389                         /*
3390                          * The merge may have left an empty extent in
3391                          * our leaf. Try to rotate it away.
3392                          */
3393                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3394                                                      dealloc, et);
3395                         if (ret)
3396                                 mlog_errno(ret);
3397                         ret = 0;
3398                 }
3399         }
3400
3401 out:
3402         return ret;
3403 }
3404
3405 static void ocfs2_subtract_from_rec(struct super_block *sb,
3406                                     enum ocfs2_split_type split,
3407                                     struct ocfs2_extent_rec *rec,
3408                                     struct ocfs2_extent_rec *split_rec)
3409 {
3410         u64 len_blocks;
3411
3412         len_blocks = ocfs2_clusters_to_blocks(sb,
3413                                 le16_to_cpu(split_rec->e_leaf_clusters));
3414
3415         if (split == SPLIT_LEFT) {
3416                 /*
3417                  * Region is on the left edge of the existing
3418                  * record.
3419                  */
3420                 le32_add_cpu(&rec->e_cpos,
3421                              le16_to_cpu(split_rec->e_leaf_clusters));
3422                 le64_add_cpu(&rec->e_blkno, len_blocks);
3423                 le16_add_cpu(&rec->e_leaf_clusters,
3424                              -le16_to_cpu(split_rec->e_leaf_clusters));
3425         } else {
3426                 /*
3427                  * Region is on the right edge of the existing
3428                  * record.
3429                  */
3430                 le16_add_cpu(&rec->e_leaf_clusters,
3431                              -le16_to_cpu(split_rec->e_leaf_clusters));
3432         }
3433 }
3434
3435 /*
3436  * Do the final bits of extent record insertion at the target leaf
3437  * list. If this leaf is part of an allocation tree, it is assumed
3438  * that the tree above has been prepared.
3439  */
3440 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3441                                  struct ocfs2_extent_list *el,
3442                                  struct ocfs2_insert_type *insert,
3443                                  struct inode *inode)
3444 {
3445         int i = insert->ins_contig_index;
3446         unsigned int range;
3447         struct ocfs2_extent_rec *rec;
3448
3449         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3450
3451         if (insert->ins_split != SPLIT_NONE) {
3452                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3453                 BUG_ON(i == -1);
3454                 rec = &el->l_recs[i];
3455                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3456                                         insert_rec);
3457                 goto rotate;
3458         }
3459
3460         /*
3461          * Contiguous insert - either left or right.
3462          */
3463         if (insert->ins_contig != CONTIG_NONE) {
3464                 rec = &el->l_recs[i];
3465                 if (insert->ins_contig == CONTIG_LEFT) {
3466                         rec->e_blkno = insert_rec->e_blkno;
3467                         rec->e_cpos = insert_rec->e_cpos;
3468                 }
3469                 le16_add_cpu(&rec->e_leaf_clusters,
3470                              le16_to_cpu(insert_rec->e_leaf_clusters));
3471                 return;
3472         }
3473
3474         /*
3475          * Handle insert into an empty leaf.
3476          */
3477         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3478             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3479              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3480                 el->l_recs[0] = *insert_rec;
3481                 el->l_next_free_rec = cpu_to_le16(1);
3482                 return;
3483         }
3484
3485         /*
3486          * Appending insert.
3487          */
3488         if (insert->ins_appending == APPEND_TAIL) {
3489                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3490                 rec = &el->l_recs[i];
3491                 range = le32_to_cpu(rec->e_cpos)
3492                         + le16_to_cpu(rec->e_leaf_clusters);
3493                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3494
3495                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3496                                 le16_to_cpu(el->l_count),
3497                                 "inode %lu, depth %u, count %u, next free %u, "
3498                                 "rec.cpos %u, rec.clusters %u, "
3499                                 "insert.cpos %u, insert.clusters %u\n",
3500                                 inode->i_ino,
3501                                 le16_to_cpu(el->l_tree_depth),
3502                                 le16_to_cpu(el->l_count),
3503                                 le16_to_cpu(el->l_next_free_rec),
3504                                 le32_to_cpu(el->l_recs[i].e_cpos),
3505                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3506                                 le32_to_cpu(insert_rec->e_cpos),
3507                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3508                 i++;
3509                 el->l_recs[i] = *insert_rec;
3510                 le16_add_cpu(&el->l_next_free_rec, 1);
3511                 return;
3512         }
3513
3514 rotate:
3515         /*
3516          * Ok, we have to rotate.
3517          *
3518          * At this point, it is safe to assume that inserting into an
3519          * empty leaf and appending to a leaf have both been handled
3520          * above.
3521          *
3522          * This leaf needs to have space, either by the empty 1st
3523          * extent record, or by virtue of an l_next_rec < l_count.
3524          */
3525         ocfs2_rotate_leaf(el, insert_rec);
3526 }
3527
3528 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3529                                            handle_t *handle,
3530                                            struct ocfs2_path *path,
3531                                            struct ocfs2_extent_rec *insert_rec)
3532 {
3533         int ret, i, next_free;
3534         struct buffer_head *bh;
3535         struct ocfs2_extent_list *el;
3536         struct ocfs2_extent_rec *rec;
3537
3538         /*
3539          * Update everything except the leaf block.
3540          */
3541         for (i = 0; i < path->p_tree_depth; i++) {
3542                 bh = path->p_node[i].bh;
3543                 el = path->p_node[i].el;
3544
3545                 next_free = le16_to_cpu(el->l_next_free_rec);
3546                 if (next_free == 0) {
3547                         ocfs2_error(inode->i_sb,
3548                                     "Dinode %llu has a bad extent list",
3549                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3550                         ret = -EIO;
3551                         return;
3552                 }
3553
3554                 rec = &el->l_recs[next_free - 1];
3555
3556                 rec->e_int_clusters = insert_rec->e_cpos;
3557                 le32_add_cpu(&rec->e_int_clusters,
3558                              le16_to_cpu(insert_rec->e_leaf_clusters));
3559                 le32_add_cpu(&rec->e_int_clusters,
3560                              -le32_to_cpu(rec->e_cpos));
3561
3562                 ret = ocfs2_journal_dirty(handle, bh);
3563                 if (ret)
3564                         mlog_errno(ret);
3565
3566         }
3567 }
3568
3569 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3570                                     struct ocfs2_extent_rec *insert_rec,
3571                                     struct ocfs2_path *right_path,
3572                                     struct ocfs2_path **ret_left_path)
3573 {
3574         int ret, next_free;
3575         struct ocfs2_extent_list *el;
3576         struct ocfs2_path *left_path = NULL;
3577
3578         *ret_left_path = NULL;
3579
3580         /*
3581          * This shouldn't happen for non-trees. The extent rec cluster
3582          * count manipulation below only works for interior nodes.
3583          */
3584         BUG_ON(right_path->p_tree_depth == 0);
3585
3586         /*
3587          * If our appending insert is at the leftmost edge of a leaf,
3588          * then we might need to update the rightmost records of the
3589          * neighboring path.
3590          */
3591         el = path_leaf_el(right_path);
3592         next_free = le16_to_cpu(el->l_next_free_rec);
3593         if (next_free == 0 ||
3594             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3595                 u32 left_cpos;
3596
3597                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3598                                                     &left_cpos);
3599                 if (ret) {
3600                         mlog_errno(ret);
3601                         goto out;
3602                 }
3603
3604                 mlog(0, "Append may need a left path update. cpos: %u, "
3605                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3606                      left_cpos);
3607
3608                 /*
3609                  * No need to worry if the append is already in the
3610                  * leftmost leaf.
3611                  */
3612                 if (left_cpos) {
3613                         left_path = ocfs2_new_path(path_root_bh(right_path),
3614                                                    path_root_el(right_path));
3615                         if (!left_path) {
3616                                 ret = -ENOMEM;
3617                                 mlog_errno(ret);
3618                                 goto out;
3619                         }
3620
3621                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3622                         if (ret) {
3623                                 mlog_errno(ret);
3624                                 goto out;
3625                         }
3626
3627                         /*
3628                          * ocfs2_insert_path() will pass the left_path to the
3629                          * journal for us.
3630                          */
3631                 }
3632         }
3633
3634         ret = ocfs2_journal_access_path(inode, handle, right_path);
3635         if (ret) {
3636                 mlog_errno(ret);
3637                 goto out;
3638         }
3639
3640         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3641
3642         *ret_left_path = left_path;
3643         ret = 0;
3644 out:
3645         if (ret != 0)
3646                 ocfs2_free_path(left_path);
3647
3648         return ret;
3649 }
3650
3651 static void ocfs2_split_record(struct inode *inode,
3652                                struct ocfs2_path *left_path,
3653                                struct ocfs2_path *right_path,
3654                                struct ocfs2_extent_rec *split_rec,
3655                                enum ocfs2_split_type split)
3656 {
3657         int index;
3658         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3659         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3660         struct ocfs2_extent_rec *rec, *tmprec;
3661
3662         right_el = path_leaf_el(right_path);;
3663         if (left_path)
3664                 left_el = path_leaf_el(left_path);
3665
3666         el = right_el;
3667         insert_el = right_el;
3668         index = ocfs2_search_extent_list(el, cpos);
3669         if (index != -1) {
3670                 if (index == 0 && left_path) {
3671                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3672
3673                         /*
3674                          * This typically means that the record
3675                          * started in the left path but moved to the
3676                          * right as a result of rotation. We either
3677                          * move the existing record to the left, or we
3678                          * do the later insert there.
3679                          *
3680                          * In this case, the left path should always
3681                          * exist as the rotate code will have passed
3682                          * it back for a post-insert update.
3683                          */
3684
3685                         if (split == SPLIT_LEFT) {
3686                                 /*
3687                                  * It's a left split. Since we know
3688                                  * that the rotate code gave us an
3689                                  * empty extent in the left path, we
3690                                  * can just do the insert there.
3691                                  */
3692                                 insert_el = left_el;
3693                         } else {
3694                                 /*
3695                                  * Right split - we have to move the
3696                                  * existing record over to the left
3697                                  * leaf. The insert will be into the
3698                                  * newly created empty extent in the
3699                                  * right leaf.
3700                                  */
3701                                 tmprec = &right_el->l_recs[index];
3702                                 ocfs2_rotate_leaf(left_el, tmprec);
3703                                 el = left_el;
3704
3705                                 memset(tmprec, 0, sizeof(*tmprec));
3706                                 index = ocfs2_search_extent_list(left_el, cpos);
3707                                 BUG_ON(index == -1);
3708                         }
3709                 }
3710         } else {
3711                 BUG_ON(!left_path);
3712                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3713                 /*
3714                  * Left path is easy - we can just allow the insert to
3715                  * happen.
3716                  */
3717                 el = left_el;
3718                 insert_el = left_el;
3719                 index = ocfs2_search_extent_list(el, cpos);
3720                 BUG_ON(index == -1);
3721         }
3722
3723         rec = &el->l_recs[index];
3724         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3725         ocfs2_rotate_leaf(insert_el, split_rec);
3726 }
3727
3728 /*
3729  * This function only does inserts on an allocation b-tree. For tree
3730  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3731  *
3732  * right_path is the path we want to do the actual insert
3733  * in. left_path should only be passed in if we need to update that
3734  * portion of the tree after an edge insert.
3735  */
3736 static int ocfs2_insert_path(struct inode *inode,
3737                              handle_t *handle,
3738                              struct ocfs2_path *left_path,
3739                              struct ocfs2_path *right_path,
3740                              struct ocfs2_extent_rec *insert_rec,
3741                              struct ocfs2_insert_type *insert)
3742 {
3743         int ret, subtree_index;
3744         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3745
3746         if (left_path) {
3747                 int credits = handle->h_buffer_credits;
3748
3749                 /*
3750                  * There's a chance that left_path got passed back to
3751                  * us without being accounted for in the
3752                  * journal. Extend our transaction here to be sure we
3753                  * can change those blocks.
3754                  */
3755                 credits += left_path->p_tree_depth;
3756
3757                 ret = ocfs2_extend_trans(handle, credits);
3758                 if (ret < 0) {
3759                         mlog_errno(ret);
3760                         goto out;
3761                 }
3762
3763                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3764                 if (ret < 0) {
3765                         mlog_errno(ret);
3766                         goto out;
3767                 }
3768         }
3769
3770         /*
3771          * Pass both paths to the journal. The majority of inserts
3772          * will be touching all components anyway.
3773          */
3774         ret = ocfs2_journal_access_path(inode, handle, right_path);
3775         if (ret < 0) {
3776                 mlog_errno(ret);
3777                 goto out;
3778         }
3779
3780         if (insert->ins_split != SPLIT_NONE) {
3781                 /*
3782                  * We could call ocfs2_insert_at_leaf() for some types
3783                  * of splits, but it's easier to just let one separate
3784                  * function sort it all out.
3785                  */
3786                 ocfs2_split_record(inode, left_path, right_path,
3787                                    insert_rec, insert->ins_split);
3788
3789                 /*
3790                  * Split might have modified either leaf and we don't
3791                  * have a guarantee that the later edge insert will
3792                  * dirty this for us.
3793                  */
3794                 if (left_path)
3795                         ret = ocfs2_journal_dirty(handle,
3796                                                   path_leaf_bh(left_path));
3797                         if (ret)
3798                                 mlog_errno(ret);
3799         } else
3800                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3801                                      insert, inode);
3802
3803         ret = ocfs2_journal_dirty(handle, leaf_bh);
3804         if (ret)
3805                 mlog_errno(ret);
3806
3807         if (left_path) {
3808                 /*
3809                  * The rotate code has indicated that we need to fix
3810                  * up portions of the tree after the insert.
3811                  *
3812                  * XXX: Should we extend the transaction here?
3813                  */
3814                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
3815                                                         right_path);
3816                 ocfs2_complete_edge_insert(inode, handle, left_path,
3817                                            right_path, subtree_index);
3818         }
3819
3820         ret = 0;
3821 out:
3822         return ret;
3823 }
3824
3825 static int ocfs2_do_insert_extent(struct inode *inode,
3826                                   handle_t *handle,
3827                                   struct ocfs2_extent_tree *et,
3828                                   struct ocfs2_extent_rec *insert_rec,
3829                                   struct ocfs2_insert_type *type)
3830 {
3831         int ret, rotate = 0;
3832         u32 cpos;
3833         struct ocfs2_path *right_path = NULL;
3834         struct ocfs2_path *left_path = NULL;
3835         struct ocfs2_extent_list *el;
3836
3837         el = et->root_el;
3838
3839         ret = ocfs2_journal_access(handle, inode, et->root_bh,
3840                                    OCFS2_JOURNAL_ACCESS_WRITE);
3841         if (ret) {
3842                 mlog_errno(ret);
3843                 goto out;
3844         }
3845
3846         if (le16_to_cpu(el->l_tree_depth) == 0) {
3847                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3848                 goto out_update_clusters;
3849         }
3850
3851         right_path = ocfs2_new_path(et->root_bh, et->root_el);
3852         if (!right_path) {
3853                 ret = -ENOMEM;
3854                 mlog_errno(ret);
3855                 goto out;
3856         }
3857
3858         /*
3859          * Determine the path to start with. Rotations need the
3860          * rightmost path, everything else can go directly to the
3861          * target leaf.
3862          */
3863         cpos = le32_to_cpu(insert_rec->e_cpos);
3864         if (type->ins_appending == APPEND_NONE &&
3865             type->ins_contig == CONTIG_NONE) {
3866                 rotate = 1;
3867                 cpos = UINT_MAX;
3868         }
3869
3870         ret = ocfs2_find_path(inode, right_path, cpos);
3871         if (ret) {
3872                 mlog_errno(ret);
3873                 goto out;
3874         }
3875
3876         /*
3877          * Rotations and appends need special treatment - they modify
3878          * parts of the tree's above them.
3879          *
3880          * Both might pass back a path immediate to the left of the
3881          * one being inserted to. This will be cause
3882          * ocfs2_insert_path() to modify the rightmost records of
3883          * left_path to account for an edge insert.
3884          *
3885          * XXX: When modifying this code, keep in mind that an insert
3886          * can wind up skipping both of these two special cases...
3887          */
3888         if (rotate) {
3889                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
3890                                               le32_to_cpu(insert_rec->e_cpos),
3891                                               right_path, &left_path);
3892                 if (ret) {
3893                         mlog_errno(ret);
3894                         goto out;
3895                 }
3896
3897                 /*
3898                  * ocfs2_rotate_tree_right() might have extended the
3899                  * transaction without re-journaling our tree root.
3900                  */
3901                 ret = ocfs2_journal_access(handle, inode, et->root_bh,
3902                                            OCFS2_JOURNAL_ACCESS_WRITE);
3903                 if (ret) {
3904                         mlog_errno(ret);
3905                         goto out;
3906                 }
3907         } else if (type->ins_appending == APPEND_TAIL
3908                    && type->ins_contig != CONTIG_LEFT) {
3909                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
3910                                                right_path, &left_path);
3911                 if (ret) {
3912                         mlog_errno(ret);
3913                         goto out;
3914                 }
3915         }
3916
3917         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
3918                                 insert_rec, type);
3919         if (ret) {
3920                 mlog_errno(ret);
3921                 goto out;
3922         }
3923
3924 out_update_clusters:
3925         if (type->ins_split == SPLIT_NONE)
3926                 ocfs2_update_clusters(inode, et,
3927                                       le16_to_cpu(insert_rec->e_leaf_clusters));
3928
3929         ret = ocfs2_journal_dirty(handle, et->root_bh);
3930         if (ret)
3931                 mlog_errno(ret);
3932
3933 out:
3934         ocfs2_free_path(left_path);
3935         ocfs2_free_path(right_path);
3936
3937         return ret;
3938 }
3939
3940 static enum ocfs2_contig_type
3941 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
3942                                struct ocfs2_extent_list *el, int index,
3943                                struct ocfs2_extent_rec *split_rec)
3944 {
3945         int status;
3946         enum ocfs2_contig_type ret = CONTIG_NONE;
3947         u32 left_cpos, right_cpos;
3948         struct ocfs2_extent_rec *rec = NULL;
3949         struct ocfs2_extent_list *new_el;
3950         struct ocfs2_path *left_path = NULL, *right_path = NULL;
3951         struct buffer_head *bh;
3952         struct ocfs2_extent_block *eb;
3953
3954         if (index > 0) {
3955                 rec = &el->l_recs[index - 1];
3956         } else if (path->p_tree_depth > 0) {
3957                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3958                                                        path, &left_cpos);
3959                 if (status)
3960                         goto out;
3961
3962                 if (left_cpos != 0) {
3963                         left_path = ocfs2_new_path(path_root_bh(path),
3964                                                    path_root_el(path));
3965                         if (!left_path)
3966                                 goto out;
3967
3968                         status = ocfs2_find_path(inode, left_path, left_cpos);
3969                         if (status)
3970                                 goto out;
3971
3972                         new_el = path_leaf_el(left_path);
3973
3974                         if (le16_to_cpu(new_el->l_next_free_rec) !=
3975                             le16_to_cpu(new_el->l_count)) {
3976                                 bh = path_leaf_bh(left_path);
3977                                 eb = (struct ocfs2_extent_block *)bh->b_data;
3978                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3979                                                                  eb);
3980                                 goto out;
3981                         }
3982                         rec = &new_el->l_recs[
3983                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
3984                 }
3985         }
3986
3987         /*
3988          * We're careful to check for an empty extent record here -
3989          * the merge code will know what to do if it sees one.
3990          */
3991         if (rec) {
3992                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3993                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3994                                 ret = CONTIG_RIGHT;
3995                 } else {
3996                         ret = ocfs2_extent_contig(inode, rec, split_rec);
3997                 }
3998         }
3999
4000         rec = NULL;
4001         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4002                 rec = &el->l_recs[index + 1];
4003         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4004                  path->p_tree_depth > 0) {
4005                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4006                                                         path, &right_cpos);
4007                 if (status)
4008                         goto out;
4009
4010                 if (right_cpos == 0)
4011                         goto out;
4012
4013                 right_path = ocfs2_new_path(path_root_bh(path),
4014                                             path_root_el(path));
4015                 if (!right_path)
4016                         goto out;
4017
4018                 status = ocfs2_find_path(inode, right_path, right_cpos);
4019                 if (status)
4020                         goto out;
4021
4022                 new_el = path_leaf_el(right_path);
4023                 rec = &new_el->l_recs[0];
4024                 if (ocfs2_is_empty_extent(rec)) {
4025                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4026                                 bh = path_leaf_bh(right_path);
4027                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4028                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4029                                                                  eb);
4030                                 goto out;
4031                         }
4032                         rec = &new_el->l_recs[1];
4033                 }
4034         }
4035
4036         if (rec) {
4037                 enum ocfs2_contig_type contig_type;
4038
4039                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4040
4041                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4042                         ret = CONTIG_LEFTRIGHT;
4043                 else if (ret == CONTIG_NONE)
4044                         ret = contig_type;
4045         }
4046
4047 out:
4048         if (left_path)
4049                 ocfs2_free_path(left_path);
4050         if (right_path)
4051                 ocfs2_free_path(right_path);
4052
4053         return ret;
4054 }
4055
4056 static void ocfs2_figure_contig_type(struct inode *inode,
4057                                      struct ocfs2_insert_type *insert,
4058                                      struct ocfs2_extent_list *el,
4059                                      struct ocfs2_extent_rec *insert_rec)
4060 {
4061         int i;
4062         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4063
4064         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4065
4066         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4067                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4068                                                   insert_rec);
4069                 if (contig_type != CONTIG_NONE) {
4070                         insert->ins_contig_index = i;
4071                         break;
4072                 }
4073         }
4074         insert->ins_contig = contig_type;
4075 }
4076
4077 /*
4078  * This should only be called against the righmost leaf extent list.
4079  *
4080  * ocfs2_figure_appending_type() will figure out whether we'll have to
4081  * insert at the tail of the rightmost leaf.
4082  *
4083  * This should also work against the root extent list for tree's with 0
4084  * depth. If we consider the root extent list to be the rightmost leaf node
4085  * then the logic here makes sense.
4086  */
4087 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4088                                         struct ocfs2_extent_list *el,
4089                                         struct ocfs2_extent_rec *insert_rec)
4090 {
4091         int i;
4092         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4093         struct ocfs2_extent_rec *rec;
4094
4095         insert->ins_appending = APPEND_NONE;
4096
4097         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4098
4099         if (!el->l_next_free_rec)
4100                 goto set_tail_append;
4101
4102         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4103                 /* Were all records empty? */
4104                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4105                         goto set_tail_append;
4106         }
4107
4108         i = le16_to_cpu(el->l_next_free_rec) - 1;
4109         rec = &el->l_recs[i];
4110
4111         if (cpos >=
4112             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4113                 goto set_tail_append;
4114
4115         return;
4116
4117 set_tail_append:
4118         insert->ins_appending = APPEND_TAIL;
4119 }
4120
4121 /*
4122  * Helper function called at the begining of an insert.
4123  *
4124  * This computes a few things that are commonly used in the process of
4125  * inserting into the btree:
4126  *   - Whether the new extent is contiguous with an existing one.
4127  *   - The current tree depth.
4128  *   - Whether the insert is an appending one.
4129  *   - The total # of free records in the tree.
4130  *
4131  * All of the information is stored on the ocfs2_insert_type
4132  * structure.
4133  */
4134 static int ocfs2_figure_insert_type(struct inode *inode,
4135                                     struct ocfs2_extent_tree *et,
4136                                     struct buffer_head **last_eb_bh,
4137                                     struct ocfs2_extent_rec *insert_rec,
4138                                     int *free_records,
4139                                     struct ocfs2_insert_type *insert)
4140 {
4141         int ret;
4142         struct ocfs2_extent_block *eb;
4143         struct ocfs2_extent_list *el;
4144         struct ocfs2_path *path = NULL;
4145         struct buffer_head *bh = NULL;
4146
4147         insert->ins_split = SPLIT_NONE;
4148
4149         el = et->root_el;
4150         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4151
4152         if (el->l_tree_depth) {
4153                 /*
4154                  * If we have tree depth, we read in the
4155                  * rightmost extent block ahead of time as
4156                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4157                  * may want it later.
4158                  */
4159                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4160                                        ocfs2_get_last_eb_blk(et), &bh,
4161                                        OCFS2_BH_CACHED, inode);
4162                 if (ret) {
4163                         mlog_exit(ret);
4164                         goto out;
4165                 }
4166                 eb = (struct ocfs2_extent_block *) bh->b_data;
4167                 el = &eb->h_list;
4168         }
4169
4170         /*
4171          * Unless we have a contiguous insert, we'll need to know if
4172          * there is room left in our allocation tree for another
4173          * extent record.
4174          *
4175          * XXX: This test is simplistic, we can search for empty
4176          * extent records too.
4177          */
4178         *free_records = le16_to_cpu(el->l_count) -
4179                 le16_to_cpu(el->l_next_free_rec);
4180
4181         if (!insert->ins_tree_depth) {
4182                 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4183                 ocfs2_figure_appending_type(insert, el, insert_rec);
4184                 return 0;
4185         }
4186
4187         path = ocfs2_new_path(et->root_bh, et->root_el);
4188         if (!path) {
4189                 ret = -ENOMEM;
4190                 mlog_errno(ret);
4191                 goto out;
4192         }
4193
4194         /*
4195          * In the case that we're inserting past what the tree
4196          * currently accounts for, ocfs2_find_path() will return for
4197          * us the rightmost tree path. This is accounted for below in
4198          * the appending code.
4199          */
4200         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4201         if (ret) {
4202                 mlog_errno(ret);
4203                 goto out;
4204         }
4205
4206         el = path_leaf_el(path);
4207
4208         /*
4209          * Now that we have the path, there's two things we want to determine:
4210          * 1) Contiguousness (also set contig_index if this is so)
4211          *
4212          * 2) Are we doing an append? We can trivially break this up
4213          *     into two types of appends: simple record append, or a
4214          *     rotate inside the tail leaf.
4215          */
4216         ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4217
4218         /*
4219          * The insert code isn't quite ready to deal with all cases of
4220          * left contiguousness. Specifically, if it's an insert into
4221          * the 1st record in a leaf, it will require the adjustment of
4222          * cluster count on the last record of the path directly to it's
4223          * left. For now, just catch that case and fool the layers
4224          * above us. This works just fine for tree_depth == 0, which
4225          * is why we allow that above.
4226          */
4227         if (insert->ins_contig == CONTIG_LEFT &&
4228             insert->ins_contig_index == 0)
4229                 insert->ins_contig = CONTIG_NONE;
4230
4231         /*
4232          * Ok, so we can simply compare against last_eb to figure out
4233          * whether the path doesn't exist. This will only happen in
4234          * the case that we're doing a tail append, so maybe we can
4235          * take advantage of that information somehow.
4236          */
4237         if (ocfs2_get_last_eb_blk(et) ==
4238             path_leaf_bh(path)->b_blocknr) {
4239                 /*
4240                  * Ok, ocfs2_find_path() returned us the rightmost
4241                  * tree path. This might be an appending insert. There are
4242                  * two cases:
4243                  *    1) We're doing a true append at the tail:
4244                  *      -This might even be off the end of the leaf
4245                  *    2) We're "appending" by rotating in the tail
4246                  */
4247                 ocfs2_figure_appending_type(insert, el, insert_rec);
4248         }
4249
4250 out:
4251         ocfs2_free_path(path);
4252
4253         if (ret == 0)
4254                 *last_eb_bh = bh;
4255         else
4256                 brelse(bh);
4257         return ret;
4258 }
4259
4260 /*
4261  * Insert an extent into an inode btree.
4262  *
4263  * The caller needs to update fe->i_clusters
4264  */
4265 static int ocfs2_insert_extent(struct ocfs2_super *osb,
4266                                handle_t *handle,
4267                                struct inode *inode,
4268                                struct buffer_head *root_bh,
4269                                u32 cpos,
4270                                u64 start_blk,
4271                                u32 new_clusters,
4272                                u8 flags,
4273                                struct ocfs2_alloc_context *meta_ac,
4274                                struct ocfs2_extent_tree *et)
4275 {
4276         int status;
4277         int uninitialized_var(free_records);
4278         struct buffer_head *last_eb_bh = NULL;
4279         struct ocfs2_insert_type insert = {0, };
4280         struct ocfs2_extent_rec rec;
4281
4282         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4283
4284         mlog(0, "add %u clusters at position %u to inode %llu\n",
4285              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4286
4287         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4288                         (OCFS2_I(inode)->ip_clusters != cpos),
4289                         "Device %s, asking for sparse allocation: inode %llu, "
4290                         "cpos %u, clusters %u\n",
4291                         osb->dev_str,
4292                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4293                         OCFS2_I(inode)->ip_clusters);
4294
4295         memset(&rec, 0, sizeof(rec));
4296         rec.e_cpos = cpu_to_le32(cpos);
4297         rec.e_blkno = cpu_to_le64(start_blk);
4298         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4299         rec.e_flags = flags;
4300
4301         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4302                                           &free_records, &insert);
4303         if (status < 0) {
4304                 mlog_errno(status);
4305                 goto bail;
4306         }
4307
4308         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4309              "Insert.contig_index: %d, Insert.free_records: %d, "
4310              "Insert.tree_depth: %d\n",
4311              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4312              free_records, insert.ins_tree_depth);
4313
4314         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4315                 status = ocfs2_grow_tree(inode, handle, et,
4316                                          &insert.ins_tree_depth, &last_eb_bh,
4317                                          meta_ac);
4318                 if (status) {
4319                         mlog_errno(status);
4320                         goto bail;
4321                 }
4322         }
4323
4324         /* Finally, we can add clusters. This might rotate the tree for us. */
4325         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4326         if (status < 0)
4327                 mlog_errno(status);
4328         else if (et->type == OCFS2_DINODE_EXTENT)
4329                 ocfs2_extent_map_insert_rec(inode, &rec);
4330
4331 bail:
4332         if (last_eb_bh)
4333                 brelse(last_eb_bh);
4334
4335         mlog_exit(status);
4336         return status;
4337 }
4338
4339 int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
4340                                handle_t *handle,
4341                                struct inode *inode,
4342                                struct buffer_head *root_bh,
4343                                u32 cpos,
4344                                u64 start_blk,
4345                                u32 new_clusters,
4346                                u8 flags,
4347                                struct ocfs2_alloc_context *meta_ac)
4348 {
4349         int status;
4350         struct ocfs2_extent_tree *et = NULL;
4351
4352         et = ocfs2_new_extent_tree(root_bh, OCFS2_DINODE_EXTENT, NULL);
4353         if (!et) {
4354                 status = -ENOMEM;
4355                 mlog_errno(status);
4356                 goto bail;
4357         }
4358
4359         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4360                                      cpos, start_blk, new_clusters,
4361                                      flags, meta_ac, et);
4362
4363         if (et)
4364                 ocfs2_free_extent_tree(et);
4365 bail:
4366         return status;
4367 }
4368
4369 int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
4370                                     handle_t *handle,
4371                                     struct inode *inode,
4372                                     struct buffer_head *root_bh,
4373                                     u32 cpos,
4374                                     u64 start_blk,
4375                                     u32 new_clusters,
4376                                     u8 flags,
4377                                     struct ocfs2_alloc_context *meta_ac,
4378                                     void *private)
4379 {
4380         int status;
4381         struct ocfs2_extent_tree *et = NULL;
4382
4383         et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_VALUE_EXTENT, private);
4384         if (!et) {
4385                 status = -ENOMEM;
4386                 mlog_errno(status);
4387                 goto bail;
4388         }
4389
4390         status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4391                                      cpos, start_blk, new_clusters,
4392                                      flags, meta_ac, et);
4393
4394         if (et)
4395                 ocfs2_free_extent_tree(et);
4396 bail:
4397         return status;
4398 }
4399
4400 /*
4401  * Allcate and add clusters into the extent b-tree.
4402  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4403  * The extent b-tree's root is root_el and it should be in root_bh, and
4404  * it is not limited to the file storage. Any extent tree can use this
4405  * function if it implements the proper ocfs2_extent_tree.
4406  */
4407 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4408                                 struct inode *inode,
4409                                 u32 *logical_offset,
4410                                 u32 clusters_to_add,
4411                                 int mark_unwritten,
4412                                 struct buffer_head *root_bh,
4413                                 struct ocfs2_extent_list *root_el,
4414                                 handle_t *handle,
4415                                 struct ocfs2_alloc_context *data_ac,
4416                                 struct ocfs2_alloc_context *meta_ac,
4417                                 enum ocfs2_alloc_restarted *reason_ret,
4418                                 enum ocfs2_extent_tree_type type,
4419                                 void *private)
4420 {
4421         int status = 0;
4422         int free_extents;
4423         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4424         u32 bit_off, num_bits;
4425         u64 block;
4426         u8 flags = 0;
4427
4428         BUG_ON(!clusters_to_add);
4429
4430         if (mark_unwritten)
4431                 flags = OCFS2_EXT_UNWRITTEN;
4432
4433         free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
4434                                               private);
4435         if (free_extents < 0) {
4436                 status = free_extents;
4437                 mlog_errno(status);
4438                 goto leave;
4439         }
4440
4441         /* there are two cases which could cause us to EAGAIN in the
4442          * we-need-more-metadata case:
4443          * 1) we haven't reserved *any*
4444          * 2) we are so fragmented, we've needed to add metadata too
4445          *    many times. */
4446         if (!free_extents && !meta_ac) {
4447                 mlog(0, "we haven't reserved any metadata!\n");
4448                 status = -EAGAIN;
4449                 reason = RESTART_META;
4450                 goto leave;
4451         } else if ((!free_extents)
4452                    && (ocfs2_alloc_context_bits_left(meta_ac)
4453                        < ocfs2_extend_meta_needed(root_el))) {
4454                 mlog(0, "filesystem is really fragmented...\n");
4455                 status = -EAGAIN;
4456                 reason = RESTART_META;
4457                 goto leave;
4458         }
4459
4460         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4461                                         clusters_to_add, &bit_off, &num_bits);
4462         if (status < 0) {
4463                 if (status != -ENOSPC)
4464                         mlog_errno(status);
4465                 goto leave;
4466         }
4467
4468         BUG_ON(num_bits > clusters_to_add);
4469
4470         /* reserve our write early -- insert_extent may update the inode */
4471         status = ocfs2_journal_access(handle, inode, root_bh,
4472                                       OCFS2_JOURNAL_ACCESS_WRITE);
4473         if (status < 0) {
4474                 mlog_errno(status);
4475                 goto leave;
4476         }
4477
4478         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4479         mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4480              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4481         if (type == OCFS2_DINODE_EXTENT)
4482                 status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh,
4483                                                     *logical_offset, block,
4484                                                     num_bits, flags, meta_ac);
4485         else
4486                 status = ocfs2_xattr_value_insert_extent(osb, handle,
4487                                                          inode, root_bh,
4488                                                          *logical_offset,
4489                                                          block, num_bits, flags,
4490                                                          meta_ac, private);
4491         if (status < 0) {
4492                 mlog_errno(status);
4493                 goto leave;
4494         }
4495
4496         status = ocfs2_journal_dirty(handle, root_bh);
4497         if (status < 0) {
4498                 mlog_errno(status);
4499                 goto leave;
4500         }
4501
4502         clusters_to_add -= num_bits;
4503         *logical_offset += num_bits;
4504
4505         if (clusters_to_add) {
4506                 mlog(0, "need to alloc once more, wanted = %u\n",
4507                      clusters_to_add);
4508                 status = -EAGAIN;
4509                 reason = RESTART_TRANS;
4510         }
4511
4512 leave:
4513         mlog_exit(status);
4514         if (reason_ret)
4515                 *reason_ret = reason;
4516         return status;
4517 }
4518
4519 static void ocfs2_make_right_split_rec(struct super_block *sb,
4520                                        struct ocfs2_extent_rec *split_rec,
4521                                        u32 cpos,
4522                                        struct ocfs2_extent_rec *rec)
4523 {
4524         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4525         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4526
4527         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4528
4529         split_rec->e_cpos = cpu_to_le32(cpos);
4530         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4531
4532         split_rec->e_blkno = rec->e_blkno;
4533         le64_add_cpu(&split_rec->e_blkno,
4534                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4535
4536         split_rec->e_flags = rec->e_flags;
4537 }
4538
4539 static int ocfs2_split_and_insert(struct inode *inode,
4540                                   handle_t *handle,
4541                                   struct ocfs2_path *path,
4542                                   struct ocfs2_extent_tree *et,
4543                                   struct buffer_head **last_eb_bh,
4544                                   int split_index,
4545                                   struct ocfs2_extent_rec *orig_split_rec,
4546                                   struct ocfs2_alloc_context *meta_ac)
4547 {
4548         int ret = 0, depth;
4549         unsigned int insert_range, rec_range, do_leftright = 0;
4550         struct ocfs2_extent_rec tmprec;
4551         struct ocfs2_extent_list *rightmost_el;
4552         struct ocfs2_extent_rec rec;
4553         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4554         struct ocfs2_insert_type insert;
4555         struct ocfs2_extent_block *eb;
4556
4557 leftright:
4558         /*
4559          * Store a copy of the record on the stack - it might move
4560          * around as the tree is manipulated below.
4561          */
4562         rec = path_leaf_el(path)->l_recs[split_index];
4563
4564         rightmost_el = et->root_el;
4565
4566         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4567         if (depth) {
4568                 BUG_ON(!(*last_eb_bh));
4569                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4570                 rightmost_el = &eb->h_list;
4571         }
4572
4573         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4574             le16_to_cpu(rightmost_el->l_count)) {
4575                 ret = ocfs2_grow_tree(inode, handle, et,
4576                                       &depth, last_eb_bh, meta_ac);
4577                 if (ret) {
4578                         mlog_errno(ret);
4579                         goto out;
4580                 }
4581         }
4582
4583         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4584         insert.ins_appending = APPEND_NONE;
4585         insert.ins_contig = CONTIG_NONE;
4586         insert.ins_tree_depth = depth;
4587
4588         insert_range = le32_to_cpu(split_rec.e_cpos) +
4589                 le16_to_cpu(split_rec.e_leaf_clusters);
4590         rec_range = le32_to_cpu(rec.e_cpos) +
4591                 le16_to_cpu(rec.e_leaf_clusters);
4592
4593         if (split_rec.e_cpos == rec.e_cpos) {
4594                 insert.ins_split = SPLIT_LEFT;
4595         } else if (insert_range == rec_range) {
4596                 insert.ins_split = SPLIT_RIGHT;
4597         } else {
4598                 /*
4599                  * Left/right split. We fake this as a right split
4600                  * first and then make a second pass as a left split.
4601                  */
4602                 insert.ins_split = SPLIT_RIGHT;
4603
4604                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4605                                            &rec);
4606
4607                 split_rec = tmprec;
4608
4609                 BUG_ON(do_leftright);
4610                 do_leftright = 1;
4611         }
4612
4613         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4614         if (ret) {
4615                 mlog_errno(ret);
4616                 goto out;
4617         }
4618
4619         if (do_leftright == 1) {
4620                 u32 cpos;
4621                 struct ocfs2_extent_list *el;
4622
4623                 do_leftright++;
4624                 split_rec = *orig_split_rec;
4625
4626                 ocfs2_reinit_path(path, 1);
4627
4628                 cpos = le32_to_cpu(split_rec.e_cpos);
4629                 ret = ocfs2_find_path(inode, path, cpos);
4630                 if (ret) {
4631                         mlog_errno(ret);
4632                         goto out;
4633                 }
4634
4635                 el = path_leaf_el(path);
4636                 split_index = ocfs2_search_extent_list(el, cpos);
4637                 goto leftright;
4638         }
4639 out:
4640
4641         return ret;
4642 }
4643
4644 /*
4645  * Mark part or all of the extent record at split_index in the leaf
4646  * pointed to by path as written. This removes the unwritten
4647  * extent flag.
4648  *
4649  * Care is taken to handle contiguousness so as to not grow the tree.
4650  *
4651  * meta_ac is not strictly necessary - we only truly need it if growth
4652  * of the tree is required. All other cases will degrade into a less
4653  * optimal tree layout.
4654  *
4655  * last_eb_bh should be the rightmost leaf block for any extent
4656  * btree. Since a split may grow the tree or a merge might shrink it,
4657  * the caller cannot trust the contents of that buffer after this call.
4658  *
4659  * This code is optimized for readability - several passes might be
4660  * made over certain portions of the tree. All of those blocks will
4661  * have been brought into cache (and pinned via the journal), so the
4662  * extra overhead is not expressed in terms of disk reads.
4663  */
4664 static int __ocfs2_mark_extent_written(struct inode *inode,
4665                                        struct ocfs2_extent_tree *et,
4666                                        handle_t *handle,
4667                                        struct ocfs2_path *path,
4668                                        int split_index,
4669                                        struct ocfs2_extent_rec *split_rec,
4670                                        struct ocfs2_alloc_context *meta_ac,
4671                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4672 {
4673         int ret = 0;
4674         struct ocfs2_extent_list *el = path_leaf_el(path);
4675         struct buffer_head *last_eb_bh = NULL;
4676         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4677         struct ocfs2_merge_ctxt ctxt;
4678         struct ocfs2_extent_list *rightmost_el;
4679
4680         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4681                 ret = -EIO;
4682                 mlog_errno(ret);
4683                 goto out;
4684         }
4685
4686         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4687             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4688              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4689                 ret = -EIO;
4690                 mlog_errno(ret);
4691                 goto out;
4692         }
4693
4694         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4695                                                             split_index,
4696                                                             split_rec);
4697
4698         /*
4699          * The core merge / split code wants to know how much room is
4700          * left in this inodes allocation tree, so we pass the
4701          * rightmost extent list.
4702          */
4703         if (path->p_tree_depth) {
4704                 struct ocfs2_extent_block *eb;
4705
4706                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4707                                        ocfs2_get_last_eb_blk(et),
4708                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4709                 if (ret) {
4710                         mlog_exit(ret);
4711                         goto out;
4712                 }
4713
4714                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4715                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4716                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4717                         ret = -EROFS;
4718                         goto out;
4719                 }
4720
4721                 rightmost_el = &eb->h_list;
4722         } else
4723                 rightmost_el = path_root_el(path);
4724
4725         if (rec->e_cpos == split_rec->e_cpos &&
4726             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4727                 ctxt.c_split_covers_rec = 1;
4728         else
4729                 ctxt.c_split_covers_rec = 0;
4730
4731         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4732
4733         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4734              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4735              ctxt.c_split_covers_rec);
4736
4737         if (ctxt.c_contig_type == CONTIG_NONE) {
4738                 if (ctxt.c_split_covers_rec)
4739                         el->l_recs[split_index] = *split_rec;
4740                 else
4741                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4742                                                      &last_eb_bh, split_index,
4743                                                      split_rec, meta_ac);
4744                 if (ret)
4745                         mlog_errno(ret);
4746         } else {
4747                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4748                                                 split_index, split_rec,
4749                                                 dealloc, &ctxt, et);
4750                 if (ret)
4751                         mlog_errno(ret);
4752         }
4753
4754 out:
4755         brelse(last_eb_bh);
4756         return ret;
4757 }
4758
4759 /*
4760  * Mark the already-existing extent at cpos as written for len clusters.
4761  *
4762  * If the existing extent is larger than the request, initiate a
4763  * split. An attempt will be made at merging with adjacent extents.
4764  *
4765  * The caller is responsible for passing down meta_ac if we'll need it.
4766  */
4767 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
4768                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4769                               struct ocfs2_alloc_context *meta_ac,
4770                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4771                               enum ocfs2_extent_tree_type et_type,
4772                               void *private)
4773 {
4774         int ret, index;
4775         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4776         struct ocfs2_extent_rec split_rec;
4777         struct ocfs2_path *left_path = NULL;
4778         struct ocfs2_extent_list *el;
4779         struct ocfs2_extent_tree *et = NULL;
4780
4781         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4782              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4783
4784         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4785                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4786                             "that are being written to, but the feature bit "
4787                             "is not set in the super block.",
4788                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4789                 ret = -EROFS;
4790                 goto out;
4791         }
4792
4793         et = ocfs2_new_extent_tree(root_bh, et_type, private);
4794         if (!et) {
4795                 ret = -ENOMEM;
4796                 mlog_errno(ret);
4797                 goto out;
4798         }
4799
4800         /*
4801          * XXX: This should be fixed up so that we just re-insert the
4802          * next extent records.
4803          */
4804         if (et_type == OCFS2_DINODE_EXTENT)
4805                 ocfs2_extent_map_trunc(inode, 0);
4806
4807         left_path = ocfs2_new_path(et->root_bh, et->root_el);
4808         if (!left_path) {
4809                 ret = -ENOMEM;
4810                 mlog_errno(ret);
4811                 goto out;
4812         }
4813
4814         ret = ocfs2_find_path(inode, left_path, cpos);
4815         if (ret) {
4816                 mlog_errno(ret);
4817                 goto out;
4818         }
4819         el = path_leaf_el(left_path);
4820
4821         index = ocfs2_search_extent_list(el, cpos);
4822         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4823                 ocfs2_error(inode->i_sb,
4824                             "Inode %llu has an extent at cpos %u which can no "
4825                             "longer be found.\n",
4826                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4827                 ret = -EROFS;
4828                 goto out;
4829         }
4830
4831         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4832         split_rec.e_cpos = cpu_to_le32(cpos);
4833         split_rec.e_leaf_clusters = cpu_to_le16(len);
4834         split_rec.e_blkno = cpu_to_le64(start_blkno);
4835         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4836         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4837
4838         ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4839                                           index, &split_rec, meta_ac,
4840                                           dealloc);
4841         if (ret)
4842                 mlog_errno(ret);
4843
4844 out:
4845         ocfs2_free_path(left_path);
4846         if (et)
4847                 ocfs2_free_extent_tree(et);
4848         return ret;
4849 }
4850
4851 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4852                             handle_t *handle, struct ocfs2_path *path,
4853                             int index, u32 new_range,
4854                             struct ocfs2_alloc_context *meta_ac)
4855 {
4856         int ret, depth, credits = handle->h_buffer_credits;
4857         struct buffer_head *last_eb_bh = NULL;
4858         struct ocfs2_extent_block *eb;
4859         struct ocfs2_extent_list *rightmost_el, *el;
4860         struct ocfs2_extent_rec split_rec;
4861         struct ocfs2_extent_rec *rec;
4862         struct ocfs2_insert_type insert;
4863
4864         /*
4865          * Setup the record to split before we grow the tree.
4866          */
4867         el = path_leaf_el(path);
4868         rec = &el->l_recs[index];
4869         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4870
4871         depth = path->p_tree_depth;
4872         if (depth > 0) {
4873                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4874                                        ocfs2_get_last_eb_blk(et),
4875                                        &last_eb_bh, OCFS2_BH_CACHED, inode);
4876                 if (ret < 0) {
4877                         mlog_errno(ret);
4878                         goto out;
4879                 }
4880
4881                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4882                 rightmost_el = &eb->h_list;
4883         } else
4884                 rightmost_el = path_leaf_el(path);
4885
4886         credits += path->p_tree_depth +
4887                    ocfs2_extend_meta_needed(et->root_el);
4888         ret = ocfs2_extend_trans(handle, credits);
4889         if (ret) {
4890                 mlog_errno(ret);
4891                 goto out;
4892         }
4893
4894         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4895             le16_to_cpu(rightmost_el->l_count)) {
4896                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4897                                       meta_ac);
4898                 if (ret) {
4899                         mlog_errno(ret);
4900                         goto out;
4901                 }
4902         }
4903
4904         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4905         insert.ins_appending = APPEND_NONE;
4906         insert.ins_contig = CONTIG_NONE;
4907         insert.ins_split = SPLIT_RIGHT;
4908         insert.ins_tree_depth = depth;
4909
4910         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4911         if (ret)
4912                 mlog_errno(ret);
4913
4914 out:
4915         brelse(last_eb_bh);
4916         return ret;
4917 }
4918
4919 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4920                               struct ocfs2_path *path, int index,
4921                               struct ocfs2_cached_dealloc_ctxt *dealloc,
4922                               u32 cpos, u32 len,
4923                               struct ocfs2_extent_tree *et)
4924 {
4925         int ret;
4926         u32 left_cpos, rec_range, trunc_range;
4927         int wants_rotate = 0, is_rightmost_tree_rec = 0;
4928         struct super_block *sb = inode->i_sb;
4929         struct ocfs2_path *left_path = NULL;
4930         struct ocfs2_extent_list *el = path_leaf_el(path);
4931         struct ocfs2_extent_rec *rec;
4932         struct ocfs2_extent_block *eb;
4933
4934         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4935                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4936                 if (ret) {
4937                         mlog_errno(ret);
4938                         goto out;
4939                 }
4940
4941                 index--;
4942         }
4943
4944         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4945             path->p_tree_depth) {
4946                 /*
4947                  * Check whether this is the rightmost tree record. If
4948                  * we remove all of this record or part of its right
4949                  * edge then an update of the record lengths above it
4950                  * will be required.
4951                  */
4952                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4953                 if (eb->h_next_leaf_blk == 0)
4954                         is_rightmost_tree_rec = 1;
4955         }
4956
4957         rec = &el->l_recs[index];
4958         if (index == 0 && path->p_tree_depth &&
4959             le32_to_cpu(rec->e_cpos) == cpos) {
4960                 /*
4961                  * Changing the leftmost offset (via partial or whole
4962                  * record truncate) of an interior (or rightmost) path
4963                  * means we have to update the subtree that is formed
4964                  * by this leaf and the one to it's left.
4965                  *
4966                  * There are two cases we can skip:
4967                  *   1) Path is the leftmost one in our inode tree.
4968                  *   2) The leaf is rightmost and will be empty after
4969                  *      we remove the extent record - the rotate code
4970                  *      knows how to update the newly formed edge.
4971                  */
4972
4973                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4974                                                     &left_cpos);
4975                 if (ret) {
4976                         mlog_errno(ret);
4977                         goto out;
4978                 }
4979
4980                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4981                         left_path = ocfs2_new_path(path_root_bh(path),
4982                                                    path_root_el(path));
4983                         if (!left_path) {
4984                                 ret = -ENOMEM;
4985                                 mlog_errno(ret);
4986                                 goto out;
4987                         }
4988
4989                         ret = ocfs2_find_path(inode, left_path, left_cpos);
4990                         if (ret) {
4991                                 mlog_errno(ret);
4992                                 goto out;
4993                         }
4994                 }
4995         }
4996
4997         ret = ocfs2_extend_rotate_transaction(handle, 0,
4998                                               handle->h_buffer_credits,
4999                                               path);
5000         if (ret) {
5001                 mlog_errno(ret);
5002                 goto out;
5003         }
5004
5005         ret = ocfs2_journal_access_path(inode, handle, path);
5006         if (ret) {
5007                 mlog_errno(ret);
5008                 goto out;
5009         }
5010
5011         ret = ocfs2_journal_access_path(inode, handle, left_path);
5012         if (ret) {
5013                 mlog_errno(ret);
5014                 goto out;
5015         }
5016
5017         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5018         trunc_range = cpos + len;
5019
5020         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5021                 int next_free;
5022
5023                 memset(rec, 0, sizeof(*rec));
5024                 ocfs2_cleanup_merge(el, index);
5025                 wants_rotate = 1;
5026
5027                 next_free = le16_to_cpu(el->l_next_free_rec);
5028                 if (is_rightmost_tree_rec && next_free > 1) {
5029                         /*
5030                          * We skip the edge update if this path will
5031                          * be deleted by the rotate code.
5032                          */
5033                         rec = &el->l_recs[next_free - 1];
5034                         ocfs2_adjust_rightmost_records(inode, handle, path,
5035                                                        rec);
5036                 }
5037         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5038                 /* Remove leftmost portion of the record. */
5039                 le32_add_cpu(&rec->e_cpos, len);
5040                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5041                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5042         } else if (rec_range == trunc_range) {
5043                 /* Remove rightmost portion of the record */
5044                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5045                 if (is_rightmost_tree_rec)
5046                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5047         } else {
5048                 /* Caller should have trapped this. */
5049                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5050                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5051                      le32_to_cpu(rec->e_cpos),
5052                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5053                 BUG();
5054         }
5055
5056         if (left_path) {
5057                 int subtree_index;
5058
5059                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5060                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5061                                            subtree_index);
5062         }
5063
5064         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5065
5066         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5067         if (ret) {
5068                 mlog_errno(ret);
5069                 goto out;
5070         }
5071
5072 out:
5073         ocfs2_free_path(left_path);
5074         return ret;
5075 }
5076
5077 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
5078                         u32 cpos, u32 len, handle_t *handle,
5079                         struct ocfs2_alloc_context *meta_ac,
5080                         struct ocfs2_cached_dealloc_ctxt *dealloc,
5081                         enum ocfs2_extent_tree_type et_type,
5082                         void *private)
5083 {
5084         int ret, index;
5085         u32 rec_range, trunc_range;
5086         struct ocfs2_extent_rec *rec;
5087         struct ocfs2_extent_list *el;
5088         struct ocfs2_path *path = NULL;
5089         struct ocfs2_extent_tree *et = NULL;
5090
5091         et = ocfs2_new_extent_tree(root_bh, et_type, private);
5092         if (!et) {
5093                 ret = -ENOMEM;
5094                 mlog_errno(ret);
5095                 goto out;
5096         }
5097
5098         ocfs2_extent_map_trunc(inode, 0);
5099
5100         path = ocfs2_new_path(et->root_bh, et->root_el);
5101         if (!path) {
5102                 ret = -ENOMEM;
5103                 mlog_errno(ret);
5104                 goto out;
5105         }
5106
5107         ret = ocfs2_find_path(inode, path, cpos);
5108         if (ret) {
5109                 mlog_errno(ret);
5110                 goto out;
5111         }
5112
5113         el = path_leaf_el(path);
5114         index = ocfs2_search_extent_list(el, cpos);
5115         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5116                 ocfs2_error(inode->i_sb,
5117                             "Inode %llu has an extent at cpos %u which can no "
5118                             "longer be found.\n",
5119                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5120                 ret = -EROFS;
5121                 goto out;
5122         }
5123
5124         /*
5125          * We have 3 cases of extent removal:
5126          *   1) Range covers the entire extent rec
5127          *   2) Range begins or ends on one edge of the extent rec
5128          *   3) Range is in the middle of the extent rec (no shared edges)
5129          *
5130          * For case 1 we remove the extent rec and left rotate to
5131          * fill the hole.
5132          *
5133          * For case 2 we just shrink the existing extent rec, with a
5134          * tree update if the shrinking edge is also the edge of an
5135          * extent block.
5136          *
5137          * For case 3 we do a right split to turn the extent rec into
5138          * something case 2 can handle.
5139          */
5140         rec = &el->l_recs[index];
5141         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5142         trunc_range = cpos + len;
5143
5144         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5145
5146         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5147              "(cpos %u, len %u)\n",
5148              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5149              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5150
5151         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5152                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5153                                          cpos, len, et);
5154                 if (ret) {
5155                         mlog_errno(ret);
5156                         goto out;
5157                 }
5158         } else {
5159                 ret = ocfs2_split_tree(inode, et, handle, path, index,
5160                                        trunc_range, meta_ac);
5161                 if (ret) {
5162                         mlog_errno(ret);
5163                         goto out;
5164                 }
5165
5166                 /*
5167                  * The split could have manipulated the tree enough to
5168                  * move the record location, so we have to look for it again.
5169                  */
5170                 ocfs2_reinit_path(path, 1);
5171
5172                 ret = ocfs2_find_path(inode, path, cpos);
5173                 if (ret) {
5174                         mlog_errno(ret);
5175                         goto out;
5176                 }
5177
5178                 el = path_leaf_el(path);
5179                 index = ocfs2_search_extent_list(el, cpos);
5180                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5181                         ocfs2_error(inode->i_sb,
5182                                     "Inode %llu: split at cpos %u lost record.",
5183                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5184                                     cpos);
5185                         ret = -EROFS;
5186                         goto out;
5187                 }
5188
5189                 /*
5190                  * Double check our values here. If anything is fishy,
5191                  * it's easier to catch it at the top level.
5192                  */
5193                 rec = &el->l_recs[index];
5194                 rec_range = le32_to_cpu(rec->e_cpos) +
5195                         ocfs2_rec_clusters(el, rec);
5196                 if (rec_range != trunc_range) {
5197                         ocfs2_error(inode->i_sb,
5198                                     "Inode %llu: error after split at cpos %u"
5199                                     "trunc len %u, existing record is (%u,%u)",
5200                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5201                                     cpos, len, le32_to_cpu(rec->e_cpos),
5202                                     ocfs2_rec_clusters(el, rec));
5203                         ret = -EROFS;
5204                         goto out;
5205                 }
5206
5207                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5208                                          cpos, len, et);
5209                 if (ret) {
5210                         mlog_errno(ret);
5211                         goto out;
5212                 }
5213         }
5214
5215 out:
5216         ocfs2_free_path(path);
5217         if (et)
5218                 ocfs2_free_extent_tree(et);
5219         return ret;
5220 }
5221
5222 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5223 {
5224         struct buffer_head *tl_bh = osb->osb_tl_bh;
5225         struct ocfs2_dinode *di;
5226         struct ocfs2_truncate_log *tl;
5227
5228         di = (struct ocfs2_dinode *) tl_bh->b_data;
5229         tl = &di->id2.i_dealloc;
5230
5231         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5232                         "slot %d, invalid truncate log parameters: used = "
5233                         "%u, count = %u\n", osb->slot_num,
5234                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5235         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5236 }
5237
5238 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5239                                            unsigned int new_start)
5240 {
5241         unsigned int tail_index;
5242         unsigned int current_tail;
5243
5244         /* No records, nothing to coalesce */
5245         if (!le16_to_cpu(tl->tl_used))
5246                 return 0;
5247
5248         tail_index = le16_to_cpu(tl->tl_used) - 1;
5249         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5250         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5251
5252         return current_tail == new_start;
5253 }
5254
5255 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5256                               handle_t *handle,
5257                               u64 start_blk,
5258                               unsigned int num_clusters)
5259 {
5260         int status, index;
5261         unsigned int start_cluster, tl_count;
5262         struct inode *tl_inode = osb->osb_tl_inode;
5263         struct buffer_head *tl_bh = osb->osb_tl_bh;
5264         struct ocfs2_dinode *di;
5265         struct ocfs2_truncate_log *tl;
5266
5267         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5268                    (unsigned long long)start_blk, num_clusters);
5269
5270         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5271
5272         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5273
5274         di = (struct ocfs2_dinode *) tl_bh->b_data;
5275         tl = &di->id2.i_dealloc;
5276         if (!OCFS2_IS_VALID_DINODE(di)) {
5277                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5278                 status = -EIO;
5279                 goto bail;
5280         }
5281
5282         tl_count = le16_to_cpu(tl->tl_count);
5283         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5284                         tl_count == 0,
5285                         "Truncate record count on #%llu invalid "
5286                         "wanted %u, actual %u\n",
5287                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5288                         ocfs2_truncate_recs_per_inode(osb->sb),
5289                         le16_to_cpu(tl->tl_count));
5290
5291         /* Caller should have known to flush before calling us. */
5292         index = le16_to_cpu(tl->tl_used);
5293         if (index >= tl_count) {
5294                 status = -ENOSPC;
5295                 mlog_errno(status);
5296                 goto bail;
5297         }
5298
5299         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5300                                       OCFS2_JOURNAL_ACCESS_WRITE);
5301         if (status < 0) {
5302                 mlog_errno(status);
5303                 goto bail;
5304         }
5305
5306         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5307              "%llu (index = %d)\n", num_clusters, start_cluster,
5308              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5309
5310         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5311                 /*
5312                  * Move index back to the record we are coalescing with.
5313                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5314                  */
5315                 index--;
5316
5317                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5318                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5319                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5320                      num_clusters);
5321         } else {
5322                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5323                 tl->tl_used = cpu_to_le16(index + 1);
5324         }
5325         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5326
5327         status = ocfs2_journal_dirty(handle, tl_bh);
5328         if (status < 0) {
5329                 mlog_errno(status);
5330                 goto bail;
5331         }
5332
5333 bail:
5334         mlog_exit(status);
5335         return status;
5336 }
5337
5338 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5339                                          handle_t *handle,
5340                                          struct inode *data_alloc_inode,
5341                                          struct buffer_head *data_alloc_bh)
5342 {
5343         int status = 0;
5344         int i;
5345         unsigned int num_clusters;
5346         u64 start_blk;
5347         struct ocfs2_truncate_rec rec;
5348         struct ocfs2_dinode *di;
5349         struct ocfs2_truncate_log *tl;
5350         struct inode *tl_inode = osb->osb_tl_inode;
5351         struct buffer_head *tl_bh = osb->osb_tl_bh;
5352
5353         mlog_entry_void();
5354
5355         di = (struct ocfs2_dinode *) tl_bh->b_data;
5356         tl = &di->id2.i_dealloc;
5357         i = le16_to_cpu(tl->tl_used) - 1;
5358         while (i >= 0) {
5359                 /* Caller has given us at least enough credits to
5360                  * update the truncate log dinode */
5361                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5362                                               OCFS2_JOURNAL_ACCESS_WRITE);
5363                 if (status < 0) {
5364                         mlog_errno(status);
5365                         goto bail;
5366                 }
5367
5368                 tl->tl_used = cpu_to_le16(i);
5369
5370                 status = ocfs2_journal_dirty(handle, tl_bh);
5371                 if (status < 0) {
5372                         mlog_errno(status);
5373                         goto bail;
5374                 }
5375
5376                 /* TODO: Perhaps we can calculate the bulk of the
5377                  * credits up front rather than extending like
5378                  * this. */
5379                 status = ocfs2_extend_trans(handle,
5380                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5381                 if (status < 0) {
5382                         mlog_errno(status);
5383                         goto bail;
5384                 }
5385
5386                 rec = tl->tl_recs[i];
5387                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5388                                                     le32_to_cpu(rec.t_start));
5389                 num_clusters = le32_to_cpu(rec.t_clusters);
5390
5391                 /* if start_blk is not set, we ignore the record as
5392                  * invalid. */
5393                 if (start_blk) {
5394                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5395                              i, le32_to_cpu(rec.t_start), num_clusters);
5396
5397                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5398                                                      data_alloc_bh, start_blk,
5399                                                      num_clusters);
5400                         if (status < 0) {
5401                                 mlog_errno(status);
5402                                 goto bail;
5403                         }
5404                 }
5405                 i--;
5406         }
5407
5408 bail:
5409         mlog_exit(status);
5410         return status;
5411 }
5412
5413 /* Expects you to already be holding tl_inode->i_mutex */
5414 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5415 {
5416         int status;
5417         unsigned int num_to_flush;
5418         handle_t *handle;
5419         struct inode *tl_inode = osb->osb_tl_inode;
5420         struct inode *data_alloc_inode = NULL;
5421         struct buffer_head *tl_bh = osb->osb_tl_bh;
5422         struct buffer_head *data_alloc_bh = NULL;
5423         struct ocfs2_dinode *di;
5424         struct ocfs2_truncate_log *tl;
5425
5426         mlog_entry_void();
5427
5428         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5429
5430         di = (struct ocfs2_dinode *) tl_bh->b_data;
5431         tl = &di->id2.i_dealloc;
5432         if (!OCFS2_IS_VALID_DINODE(di)) {
5433                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5434                 status = -EIO;
5435                 goto out;
5436         }
5437
5438         num_to_flush = le16_to_cpu(tl->tl_used);
5439         mlog(0, "Flush %u records from truncate log #%llu\n",
5440              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5441         if (!num_to_flush) {
5442                 status = 0;
5443                 goto out;
5444         }
5445
5446         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5447                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5448                                                        OCFS2_INVALID_SLOT);
5449         if (!data_alloc_inode) {
5450                 status = -EINVAL;
5451                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5452                 goto out;
5453         }
5454
5455         mutex_lock(&data_alloc_inode->i_mutex);
5456
5457         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5458         if (status < 0) {
5459                 mlog_errno(status);
5460                 goto out_mutex;
5461         }
5462
5463         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5464         if (IS_ERR(handle)) {
5465                 status = PTR_ERR(handle);
5466                 mlog_errno(status);
5467                 goto out_unlock;
5468         }
5469
5470         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5471                                                data_alloc_bh);
5472         if (status < 0)
5473                 mlog_errno(status);
5474
5475         ocfs2_commit_trans(osb, handle);
5476
5477 out_unlock:
5478         brelse(data_alloc_bh);
5479         ocfs2_inode_unlock(data_alloc_inode, 1);
5480
5481 out_mutex:
5482         mutex_unlock(&data_alloc_inode->i_mutex);
5483         iput(data_alloc_inode);
5484
5485 out:
5486         mlog_exit(status);
5487         return status;
5488 }
5489
5490 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5491 {
5492         int status;
5493         struct inode *tl_inode = osb->osb_tl_inode;
5494
5495         mutex_lock(&tl_inode->i_mutex);
5496         status = __ocfs2_flush_truncate_log(osb);
5497         mutex_unlock(&tl_inode->i_mutex);
5498
5499         return status;
5500 }
5501
5502 static void ocfs2_truncate_log_worker(struct work_struct *work)
5503 {
5504         int status;
5505         struct ocfs2_super *osb =
5506                 container_of(work, struct ocfs2_super,
5507                              osb_truncate_log_wq.work);
5508
5509         mlog_entry_void();
5510
5511         status = ocfs2_flush_truncate_log(osb);
5512         if (status < 0)
5513                 mlog_errno(status);
5514         else
5515                 ocfs2_init_inode_steal_slot(osb);
5516
5517         mlog_exit(status);
5518 }
5519
5520 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5521 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5522                                        int cancel)
5523 {
5524         if (osb->osb_tl_inode) {
5525                 /* We want to push off log flushes while truncates are
5526                  * still running. */
5527                 if (cancel)
5528                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5529
5530                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5531                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5532         }
5533 }
5534
5535 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5536                                        int slot_num,
5537                                        struct inode **tl_inode,
5538                                        struct buffer_head **tl_bh)
5539 {
5540         int status;
5541         struct inode *inode = NULL;
5542         struct buffer_head *bh = NULL;
5543
5544         inode = ocfs2_get_system_file_inode(osb,
5545                                            TRUNCATE_LOG_SYSTEM_INODE,
5546                                            slot_num);
5547         if (!inode) {
5548                 status = -EINVAL;
5549                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5550                 goto bail;
5551         }
5552
5553         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5554                                   OCFS2_BH_CACHED, inode);
5555         if (status < 0) {
5556                 iput(inode);
5557                 mlog_errno(status);
5558                 goto bail;
5559         }
5560
5561         *tl_inode = inode;
5562         *tl_bh    = bh;
5563 bail:
5564         mlog_exit(status);
5565         return status;
5566 }
5567
5568 /* called during the 1st stage of node recovery. we stamp a clean
5569  * truncate log and pass back a copy for processing later. if the
5570  * truncate log does not require processing, a *tl_copy is set to
5571  * NULL. */
5572 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5573                                       int slot_num,
5574                                       struct ocfs2_dinode **tl_copy)
5575 {
5576         int status;
5577         struct inode *tl_inode = NULL;
5578         struct buffer_head *tl_bh = NULL;
5579         struct ocfs2_dinode *di;
5580         struct ocfs2_truncate_log *tl;
5581
5582         *tl_copy = NULL;
5583
5584         mlog(0, "recover truncate log from slot %d\n", slot_num);
5585
5586         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5587         if (status < 0) {
5588                 mlog_errno(status);
5589                 goto bail;
5590         }
5591
5592         di = (struct ocfs2_dinode *) tl_bh->b_data;
5593         tl = &di->id2.i_dealloc;
5594         if (!OCFS2_IS_VALID_DINODE(di)) {
5595                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5596                 status = -EIO;
5597                 goto bail;
5598         }
5599
5600         if (le16_to_cpu(tl->tl_used)) {
5601                 mlog(0, "We'll have %u logs to recover\n",
5602                      le16_to_cpu(tl->tl_used));
5603
5604                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5605                 if (!(*tl_copy)) {
5606                         status = -ENOMEM;
5607                         mlog_errno(status);
5608                         goto bail;
5609                 }
5610
5611                 /* Assuming the write-out below goes well, this copy
5612                  * will be passed back to recovery for processing. */
5613                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5614
5615                 /* All we need to do to clear the truncate log is set
5616                  * tl_used. */
5617                 tl->tl_used = 0;
5618
5619                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5620                 if (status < 0) {
5621                         mlog_errno(status);
5622                         goto bail;
5623                 }
5624         }
5625
5626 bail:
5627         if (tl_inode)
5628                 iput(tl_inode);
5629         if (tl_bh)
5630                 brelse(tl_bh);
5631
5632         if (status < 0 && (*tl_copy)) {
5633                 kfree(*tl_copy);
5634                 *tl_copy = NULL;
5635         }
5636
5637         mlog_exit(status);
5638         return status;
5639 }
5640
5641 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5642                                          struct ocfs2_dinode *tl_copy)
5643 {
5644         int status = 0;
5645         int i;
5646         unsigned int clusters, num_recs, start_cluster;
5647         u64 start_blk;
5648         handle_t *handle;
5649         struct inode *tl_inode = osb->osb_tl_inode;
5650         struct ocfs2_truncate_log *tl;
5651
5652         mlog_entry_void();
5653
5654         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5655                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5656                 return -EINVAL;
5657         }
5658
5659         tl = &tl_copy->id2.i_dealloc;
5660         num_recs = le16_to_cpu(tl->tl_used);
5661         mlog(0, "cleanup %u records from %llu\n", num_recs,
5662              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5663
5664         mutex_lock(&tl_inode->i_mutex);
5665         for(i = 0; i < num_recs; i++) {
5666                 if (ocfs2_truncate_log_needs_flush(osb)) {
5667                         status = __ocfs2_flush_truncate_log(osb);
5668                         if (status < 0) {
5669                                 mlog_errno(status);
5670                                 goto bail_up;
5671                         }
5672                 }
5673
5674                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5675                 if (IS_ERR(handle)) {
5676                         status = PTR_ERR(handle);
5677                         mlog_errno(status);
5678                         goto bail_up;
5679                 }
5680
5681                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5682                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5683                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5684
5685                 status = ocfs2_truncate_log_append(osb, handle,
5686                                                    start_blk, clusters);
5687                 ocfs2_commit_trans(osb, handle);
5688                 if (status < 0) {
5689                         mlog_errno(status);
5690                         goto bail_up;
5691                 }
5692         }
5693
5694 bail_up:
5695         mutex_unlock(&tl_inode->i_mutex);
5696
5697         mlog_exit(status);
5698         return status;
5699 }
5700
5701 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5702 {
5703         int status;
5704         struct inode *tl_inode = osb->osb_tl_inode;
5705
5706         mlog_entry_void();
5707
5708         if (tl_inode) {
5709                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5710                 flush_workqueue(ocfs2_wq);
5711
5712                 status = ocfs2_flush_truncate_log(osb);
5713                 if (status < 0)
5714                         mlog_errno(status);
5715
5716                 brelse(osb->osb_tl_bh);
5717                 iput(osb->osb_tl_inode);
5718         }
5719
5720         mlog_exit_void();
5721 }
5722
5723 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5724 {
5725         int status;
5726         struct inode *tl_inode = NULL;
5727         struct buffer_head *tl_bh = NULL;
5728
5729         mlog_entry_void();
5730
5731         status = ocfs2_get_truncate_log_info(osb,
5732                                              osb->slot_num,
5733                                              &tl_inode,
5734                                              &tl_bh);
5735         if (status < 0)
5736                 mlog_errno(status);
5737
5738         /* ocfs2_truncate_log_shutdown keys on the existence of
5739          * osb->osb_tl_inode so we don't set any of the osb variables
5740          * until we're sure all is well. */
5741         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5742                           ocfs2_truncate_log_worker);
5743         osb->osb_tl_bh    = tl_bh;
5744         osb->osb_tl_inode = tl_inode;
5745
5746         mlog_exit(status);
5747         return status;
5748 }
5749
5750 /*
5751  * Delayed de-allocation of suballocator blocks.
5752  *
5753  * Some sets of block de-allocations might involve multiple suballocator inodes.
5754  *
5755  * The locking for this can get extremely complicated, especially when
5756  * the suballocator inodes to delete from aren't known until deep
5757  * within an unrelated codepath.
5758  *
5759  * ocfs2_extent_block structures are a good example of this - an inode
5760  * btree could have been grown by any number of nodes each allocating
5761  * out of their own suballoc inode.
5762  *
5763  * These structures allow the delay of block de-allocation until a
5764  * later time, when locking of multiple cluster inodes won't cause
5765  * deadlock.
5766  */
5767
5768 /*
5769  * Describes a single block free from a suballocator
5770  */
5771 struct ocfs2_cached_block_free {
5772         struct ocfs2_cached_block_free          *free_next;
5773         u64                                     free_blk;
5774         unsigned int                            free_bit;
5775 };
5776
5777 struct ocfs2_per_slot_free_list {
5778         struct ocfs2_per_slot_free_list         *f_next_suballocator;
5779         int                                     f_inode_type;
5780         int                                     f_slot;
5781         struct ocfs2_cached_block_free          *f_first;
5782 };
5783
5784 static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5785                                    int sysfile_type,
5786                                    int slot,
5787                                    struct ocfs2_cached_block_free *head)
5788 {
5789         int ret;
5790         u64 bg_blkno;
5791         handle_t *handle;
5792         struct inode *inode;
5793         struct buffer_head *di_bh = NULL;
5794         struct ocfs2_cached_block_free *tmp;
5795
5796         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5797         if (!inode) {
5798                 ret = -EINVAL;
5799                 mlog_errno(ret);
5800                 goto out;
5801         }
5802
5803         mutex_lock(&inode->i_mutex);
5804
5805         ret = ocfs2_inode_lock(inode, &di_bh, 1);
5806         if (ret) {
5807                 mlog_errno(ret);
5808                 goto out_mutex;
5809         }
5810
5811         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5812         if (IS_ERR(handle)) {
5813                 ret = PTR_ERR(handle);
5814                 mlog_errno(ret);
5815                 goto out_unlock;
5816         }
5817
5818         while (head) {
5819                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5820                                                       head->free_bit);
5821                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5822                      head->free_bit, (unsigned long long)head->free_blk);
5823
5824                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5825                                                head->free_bit, bg_blkno, 1);
5826                 if (ret) {
5827                         mlog_errno(ret);
5828                         goto out_journal;
5829                 }
5830
5831                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5832                 if (ret) {
5833                         mlog_errno(ret);
5834                         goto out_journal;
5835                 }
5836
5837                 tmp = head;
5838                 head = head->free_next;
5839                 kfree(tmp);
5840         }
5841
5842 out_journal:
5843         ocfs2_commit_trans(osb, handle);
5844
5845 out_unlock:
5846         ocfs2_inode_unlock(inode, 1);
5847         brelse(di_bh);
5848 out_mutex:
5849         mutex_unlock(&inode->i_mutex);
5850         iput(inode);
5851 out:
5852         while(head) {
5853                 /* Premature exit may have left some dangling items. */
5854                 tmp = head;
5855                 head = head->free_next;
5856                 kfree(tmp);
5857         }
5858
5859         return ret;
5860 }
5861
5862 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5863                        struct ocfs2_cached_dealloc_ctxt *ctxt)
5864 {
5865         int ret = 0, ret2;
5866         struct ocfs2_per_slot_free_list *fl;
5867
5868         if (!ctxt)
5869                 return 0;
5870
5871         while (ctxt->c_first_suballocator) {
5872                 fl = ctxt->c_first_suballocator;
5873
5874                 if (fl->f_first) {
5875                         mlog(0, "Free items: (type %u, slot %d)\n",
5876                              fl->f_inode_type, fl->f_slot);
5877                         ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5878                                                        fl->f_slot, fl->f_first);
5879                         if (ret2)
5880                                 mlog_errno(ret2);
5881                         if (!ret)
5882                                 ret = ret2;
5883                 }
5884
5885                 ctxt->c_first_suballocator = fl->f_next_suballocator;
5886                 kfree(fl);
5887         }
5888
5889         return ret;
5890 }
5891
5892 static struct ocfs2_per_slot_free_list *
5893 ocfs2_find_per_slot_free_list(int type,
5894                               int slot,
5895                               struct ocfs2_cached_dealloc_ctxt *ctxt)
5896 {
5897         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5898
5899         while (fl) {
5900                 if (fl->f_inode_type == type && fl->f_slot == slot)
5901                         return fl;
5902
5903                 fl = fl->f_next_suballocator;
5904         }
5905
5906         fl = kmalloc(sizeof(*fl), GFP_NOFS);
5907         if (fl) {
5908                 fl->f_inode_type = type;
5909                 fl->f_slot = slot;
5910                 fl->f_first = NULL;
5911                 fl->f_next_suballocator = ctxt->c_first_suballocator;
5912
5913                 ctxt->c_first_suballocator = fl;
5914         }
5915         return fl;
5916 }
5917
5918 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5919                                      int type, int slot, u64 blkno,
5920                                      unsigned int bit)
5921 {
5922         int ret;
5923         struct ocfs2_per_slot_free_list *fl;
5924         struct ocfs2_cached_block_free *item;
5925
5926         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5927         if (fl == NULL) {
5928                 ret = -ENOMEM;
5929                 mlog_errno(ret);
5930                 goto out;
5931         }
5932
5933         item = kmalloc(sizeof(*item), GFP_NOFS);
5934         if (item == NULL) {
5935                 ret = -ENOMEM;
5936                 mlog_errno(ret);
5937                 goto out;
5938         }
5939
5940         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5941              type, slot, bit, (unsigned long long)blkno);
5942
5943         item->free_blk = blkno;
5944         item->free_bit = bit;
5945         item->free_next = fl->f_first;
5946
5947         fl->f_first = item;
5948
5949         ret = 0;
5950 out:
5951         return ret;
5952 }
5953
5954 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5955                                          struct ocfs2_extent_block *eb)
5956 {
5957         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5958                                          le16_to_cpu(eb->h_suballoc_slot),
5959                                          le64_to_cpu(eb->h_blkno),
5960                                          le16_to_cpu(eb->h_suballoc_bit));
5961 }
5962
5963 /* This function will figure out whether the currently last extent
5964  * block will be deleted, and if it will, what the new last extent
5965  * block will be so we can update his h_next_leaf_blk field, as well
5966  * as the dinodes i_last_eb_blk */
5967 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
5968                                        unsigned int clusters_to_del,
5969                                        struct ocfs2_path *path,
5970                                        struct buffer_head **new_last_eb)
5971 {
5972         int next_free, ret = 0;
5973         u32 cpos;
5974         struct ocfs2_extent_rec *rec;
5975         struct ocfs2_extent_block *eb;
5976         struct ocfs2_extent_list *el;
5977         struct buffer_head *bh = NULL;
5978
5979         *new_last_eb = NULL;
5980
5981         /* we have no tree, so of course, no last_eb. */
5982         if (!path->p_tree_depth)
5983                 goto out;
5984
5985         /* trunc to zero special case - this makes tree_depth = 0
5986          * regardless of what it is.  */
5987         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
5988                 goto out;
5989
5990         el = path_leaf_el(path);
5991         BUG_ON(!el->l_next_free_rec);
5992
5993         /*
5994          * Make sure that this extent list will actually be empty
5995          * after we clear away the data. We can shortcut out if
5996          * there's more than one non-empty extent in the
5997          * list. Otherwise, a check of the remaining extent is
5998          * necessary.
5999          */
6000         next_free = le16_to_cpu(el->l_next_free_rec);
6001         rec = NULL;
6002         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6003                 if (next_free > 2)
6004                         goto out;
6005
6006                 /* We may have a valid extent in index 1, check it. */
6007                 if (next_free == 2)
6008                         rec = &el->l_recs[1];
6009
6010                 /*
6011                  * Fall through - no more nonempty extents, so we want
6012                  * to delete this leaf.
6013                  */
6014         } else {
6015                 if (next_free > 1)
6016                         goto out;
6017
6018                 rec = &el->l_recs[0];
6019         }
6020
6021         if (rec) {
6022                 /*
6023                  * Check it we'll only be trimming off the end of this
6024                  * cluster.
6025                  */
6026                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6027                         goto out;
6028         }
6029
6030         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6031         if (ret) {
6032                 mlog_errno(ret);
6033                 goto out;
6034         }
6035
6036         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6037         if (ret) {
6038                 mlog_errno(ret);
6039                 goto out;
6040         }
6041
6042         eb = (struct ocfs2_extent_block *) bh->b_data;
6043         el = &eb->h_list;
6044         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6045                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6046                 ret = -EROFS;
6047                 goto out;
6048         }
6049
6050         *new_last_eb = bh;
6051         get_bh(*new_last_eb);
6052         mlog(0, "returning block %llu, (cpos: %u)\n",
6053              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6054 out:
6055         brelse(bh);
6056
6057         return ret;
6058 }
6059
6060 /*
6061  * Trim some clusters off the rightmost edge of a tree. Only called
6062  * during truncate.
6063  *
6064  * The caller needs to:
6065  *   - start journaling of each path component.
6066  *   - compute and fully set up any new last ext block
6067  */
6068 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6069                            handle_t *handle, struct ocfs2_truncate_context *tc,
6070                            u32 clusters_to_del, u64 *delete_start)
6071 {
6072         int ret, i, index = path->p_tree_depth;
6073         u32 new_edge = 0;
6074         u64 deleted_eb = 0;
6075         struct buffer_head *bh;
6076         struct ocfs2_extent_list *el;
6077         struct ocfs2_extent_rec *rec;
6078
6079         *delete_start = 0;
6080
6081         while (index >= 0) {
6082                 bh = path->p_node[index].bh;
6083                 el = path->p_node[index].el;
6084
6085                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6086                      index,  (unsigned long long)bh->b_blocknr);
6087
6088                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6089
6090                 if (index !=
6091                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6092                         ocfs2_error(inode->i_sb,
6093                                     "Inode %lu has invalid ext. block %llu",
6094                                     inode->i_ino,
6095                                     (unsigned long long)bh->b_blocknr);
6096                         ret = -EROFS;
6097                         goto out;
6098                 }
6099
6100 find_tail_record:
6101                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6102                 rec = &el->l_recs[i];
6103
6104                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6105                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6106                      ocfs2_rec_clusters(el, rec),
6107                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6108                      le16_to_cpu(el->l_next_free_rec));
6109
6110                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6111
6112                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6113                         /*
6114                          * If the leaf block contains a single empty
6115                          * extent and no records, we can just remove
6116                          * the block.
6117                          */
6118                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6119                                 memset(rec, 0,
6120                                        sizeof(struct ocfs2_extent_rec));
6121                                 el->l_next_free_rec = cpu_to_le16(0);
6122
6123                                 goto delete;
6124                         }
6125
6126                         /*
6127                          * Remove any empty extents by shifting things
6128                          * left. That should make life much easier on
6129                          * the code below. This condition is rare
6130                          * enough that we shouldn't see a performance
6131                          * hit.
6132                          */
6133                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6134                                 le16_add_cpu(&el->l_next_free_rec, -1);
6135
6136                                 for(i = 0;
6137                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6138                                         el->l_recs[i] = el->l_recs[i + 1];
6139
6140                                 memset(&el->l_recs[i], 0,
6141                                        sizeof(struct ocfs2_extent_rec));
6142
6143                                 /*
6144                                  * We've modified our extent list. The
6145                                  * simplest way to handle this change
6146                                  * is to being the search from the
6147                                  * start again.
6148                                  */
6149                                 goto find_tail_record;
6150                         }
6151
6152                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6153
6154                         /*
6155                          * We'll use "new_edge" on our way back up the
6156                          * tree to know what our rightmost cpos is.
6157                          */
6158                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6159                         new_edge += le32_to_cpu(rec->e_cpos);
6160
6161                         /*
6162                          * The caller will use this to delete data blocks.
6163                          */
6164                         *delete_start = le64_to_cpu(rec->e_blkno)
6165                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6166                                         le16_to_cpu(rec->e_leaf_clusters));
6167
6168                         /*
6169                          * If it's now empty, remove this record.
6170                          */
6171                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6172                                 memset(rec, 0,
6173                                        sizeof(struct ocfs2_extent_rec));
6174                                 le16_add_cpu(&el->l_next_free_rec, -1);
6175                         }
6176                 } else {
6177                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6178                                 memset(rec, 0,
6179                                        sizeof(struct ocfs2_extent_rec));
6180                                 le16_add_cpu(&el->l_next_free_rec, -1);
6181
6182                                 goto delete;
6183                         }
6184
6185                         /* Can this actually happen? */
6186                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6187                                 goto delete;
6188
6189                         /*
6190                          * We never actually deleted any clusters
6191                          * because our leaf was empty. There's no
6192                          * reason to adjust the rightmost edge then.
6193                          */
6194                         if (new_edge == 0)
6195                                 goto delete;
6196
6197                         rec->e_int_clusters = cpu_to_le32(new_edge);
6198                         le32_add_cpu(&rec->e_int_clusters,
6199                                      -le32_to_cpu(rec->e_cpos));
6200
6201                          /*
6202                           * A deleted child record should have been
6203                           * caught above.
6204                           */
6205                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6206                 }
6207
6208 delete:
6209                 ret = ocfs2_journal_dirty(handle, bh);
6210                 if (ret) {
6211                         mlog_errno(ret);
6212                         goto out;
6213                 }
6214
6215                 mlog(0, "extent list container %llu, after: record %d: "
6216                      "(%u, %u, %llu), next = %u.\n",
6217                      (unsigned long long)bh->b_blocknr, i,
6218                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6219                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6220                      le16_to_cpu(el->l_next_free_rec));
6221
6222                 /*
6223                  * We must be careful to only attempt delete of an
6224                  * extent block (and not the root inode block).
6225                  */
6226                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6227                         struct ocfs2_extent_block *eb =
6228                                 (struct ocfs2_extent_block *)bh->b_data;
6229
6230                         /*
6231                          * Save this for use when processing the
6232                          * parent block.
6233                          */
6234                         deleted_eb = le64_to_cpu(eb->h_blkno);
6235
6236                         mlog(0, "deleting this extent block.\n");
6237
6238                         ocfs2_remove_from_cache(inode, bh);
6239
6240                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6241                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6242                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6243
6244                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6245                         /* An error here is not fatal. */
6246                         if (ret < 0)
6247                                 mlog_errno(ret);
6248                 } else {
6249                         deleted_eb = 0;
6250                 }
6251
6252                 index--;
6253         }
6254
6255         ret = 0;
6256 out:
6257         return ret;
6258 }
6259
6260 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6261                              unsigned int clusters_to_del,
6262                              struct inode *inode,
6263                              struct buffer_head *fe_bh,
6264                              handle_t *handle,
6265                              struct ocfs2_truncate_context *tc,
6266                              struct ocfs2_path *path)
6267 {
6268         int status;
6269         struct ocfs2_dinode *fe;
6270         struct ocfs2_extent_block *last_eb = NULL;
6271         struct ocfs2_extent_list *el;
6272         struct buffer_head *last_eb_bh = NULL;
6273         u64 delete_blk = 0;
6274
6275         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6276
6277         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6278                                              path, &last_eb_bh);
6279         if (status < 0) {
6280                 mlog_errno(status);
6281                 goto bail;
6282         }
6283
6284         /*
6285          * Each component will be touched, so we might as well journal
6286          * here to avoid having to handle errors later.
6287          */
6288         status = ocfs2_journal_access_path(inode, handle, path);
6289         if (status < 0) {
6290                 mlog_errno(status);
6291                 goto bail;
6292         }
6293
6294         if (last_eb_bh) {
6295                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
6296                                               OCFS2_JOURNAL_ACCESS_WRITE);
6297                 if (status < 0) {
6298                         mlog_errno(status);
6299                         goto bail;
6300                 }
6301
6302                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6303         }
6304
6305         el = &(fe->id2.i_list);
6306
6307         /*
6308          * Lower levels depend on this never happening, but it's best
6309          * to check it up here before changing the tree.
6310          */
6311         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6312                 ocfs2_error(inode->i_sb,
6313                             "Inode %lu has an empty extent record, depth %u\n",
6314                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6315                 status = -EROFS;
6316                 goto bail;
6317         }
6318
6319         spin_lock(&OCFS2_I(inode)->ip_lock);
6320         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6321                                       clusters_to_del;
6322         spin_unlock(&OCFS2_I(inode)->ip_lock);
6323         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6324         inode->i_blocks = ocfs2_inode_sector_count(inode);
6325
6326         status = ocfs2_trim_tree(inode, path, handle, tc,
6327                                  clusters_to_del, &delete_blk);
6328         if (status) {
6329                 mlog_errno(status);
6330                 goto bail;
6331         }
6332
6333         if (le32_to_cpu(fe->i_clusters) == 0) {
6334                 /* trunc to zero is a special case. */
6335                 el->l_tree_depth = 0;
6336                 fe->i_last_eb_blk = 0;
6337         } else if (last_eb)
6338                 fe->i_last_eb_blk = last_eb->h_blkno;
6339
6340         status = ocfs2_journal_dirty(handle, fe_bh);
6341         if (status < 0) {
6342                 mlog_errno(status);
6343                 goto bail;
6344         }
6345
6346         if (last_eb) {
6347                 /* If there will be a new last extent block, then by
6348                  * definition, there cannot be any leaves to the right of
6349                  * him. */
6350                 last_eb->h_next_leaf_blk = 0;
6351                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6352                 if (status < 0) {
6353                         mlog_errno(status);
6354                         goto bail;
6355                 }
6356         }
6357
6358         if (delete_blk) {
6359                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6360                                                    clusters_to_del);
6361                 if (status < 0) {
6362                         mlog_errno(status);
6363                         goto bail;
6364                 }
6365         }
6366         status = 0;
6367 bail:
6368
6369         mlog_exit(status);
6370         return status;
6371 }
6372
6373 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
6374 {
6375         set_buffer_uptodate(bh);
6376         mark_buffer_dirty(bh);
6377         return 0;
6378 }
6379
6380 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6381 {
6382         set_buffer_uptodate(bh);
6383         mark_buffer_dirty(bh);
6384         return ocfs2_journal_dirty_data(handle, bh);
6385 }
6386
6387 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6388                                      unsigned int from, unsigned int to,
6389                                      struct page *page, int zero, u64 *phys)
6390 {
6391         int ret, partial = 0;
6392
6393         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6394         if (ret)
6395                 mlog_errno(ret);
6396
6397         if (zero)
6398                 zero_user_segment(page, from, to);
6399
6400         /*
6401          * Need to set the buffers we zero'd into uptodate
6402          * here if they aren't - ocfs2_map_page_blocks()
6403          * might've skipped some
6404          */
6405         if (ocfs2_should_order_data(inode)) {
6406                 ret = walk_page_buffers(handle,
6407                                         page_buffers(page),
6408                                         from, to, &partial,
6409                                         ocfs2_ordered_zero_func);
6410                 if (ret < 0)
6411                         mlog_errno(ret);
6412         } else {
6413                 ret = walk_page_buffers(handle, page_buffers(page),
6414                                         from, to, &partial,
6415                                         ocfs2_writeback_zero_func);
6416                 if (ret < 0)
6417                         mlog_errno(ret);
6418         }
6419
6420         if (!partial)
6421                 SetPageUptodate(page);
6422
6423         flush_dcache_page(page);
6424 }
6425
6426 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6427                                      loff_t end, struct page **pages,
6428                                      int numpages, u64 phys, handle_t *handle)
6429 {
6430         int i;
6431         struct page *page;
6432         unsigned int from, to = PAGE_CACHE_SIZE;
6433         struct super_block *sb = inode->i_sb;
6434
6435         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6436
6437         if (numpages == 0)
6438                 goto out;
6439
6440         to = PAGE_CACHE_SIZE;
6441         for(i = 0; i < numpages; i++) {
6442                 page = pages[i];
6443
6444                 from = start & (PAGE_CACHE_SIZE - 1);
6445                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6446                         to = end & (PAGE_CACHE_SIZE - 1);
6447
6448                 BUG_ON(from > PAGE_CACHE_SIZE);
6449                 BUG_ON(to > PAGE_CACHE_SIZE);
6450
6451                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6452                                          &phys);
6453
6454                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6455         }
6456 out:
6457         if (pages)
6458                 ocfs2_unlock_and_free_pages(pages, numpages);
6459 }
6460
6461 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6462                                 struct page **pages, int *num)
6463 {
6464         int numpages, ret = 0;
6465         struct super_block *sb = inode->i_sb;
6466         struct address_space *mapping = inode->i_mapping;
6467         unsigned long index;
6468         loff_t last_page_bytes;
6469
6470         BUG_ON(start > end);
6471
6472         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6473                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6474
6475         numpages = 0;
6476         last_page_bytes = PAGE_ALIGN(end);
6477         index = start >> PAGE_CACHE_SHIFT;
6478         do {
6479                 pages[numpages] = grab_cache_page(mapping, index);
6480                 if (!pages[numpages]) {
6481                         ret = -ENOMEM;
6482                         mlog_errno(ret);
6483                         goto out;
6484                 }
6485
6486                 numpages++;
6487                 index++;
6488         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6489
6490 out:
6491         if (ret != 0) {
6492                 if (pages)
6493                         ocfs2_unlock_and_free_pages(pages, numpages);
6494                 numpages = 0;
6495         }
6496
6497         *num = numpages;
6498
6499         return ret;
6500 }
6501
6502 /*
6503  * Zero the area past i_size but still within an allocated
6504  * cluster. This avoids exposing nonzero data on subsequent file
6505  * extends.
6506  *
6507  * We need to call this before i_size is updated on the inode because
6508  * otherwise block_write_full_page() will skip writeout of pages past
6509  * i_size. The new_i_size parameter is passed for this reason.
6510  */
6511 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6512                                   u64 range_start, u64 range_end)
6513 {
6514         int ret = 0, numpages;
6515         struct page **pages = NULL;
6516         u64 phys;
6517         unsigned int ext_flags;
6518         struct super_block *sb = inode->i_sb;
6519
6520         /*
6521          * File systems which don't support sparse files zero on every
6522          * extend.
6523          */
6524         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6525                 return 0;
6526
6527         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6528                         sizeof(struct page *), GFP_NOFS);
6529         if (pages == NULL) {
6530                 ret = -ENOMEM;
6531                 mlog_errno(ret);
6532                 goto out;
6533         }
6534
6535         if (range_start == range_end)
6536                 goto out;
6537
6538         ret = ocfs2_extent_map_get_blocks(inode,
6539                                           range_start >> sb->s_blocksize_bits,
6540                                           &phys, NULL, &ext_flags);
6541         if (ret) {
6542                 mlog_errno(ret);
6543                 goto out;
6544         }
6545
6546         /*
6547          * Tail is a hole, or is marked unwritten. In either case, we
6548          * can count on read and write to return/push zero's.
6549          */
6550         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6551                 goto out;
6552
6553         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6554                                    &numpages);
6555         if (ret) {
6556                 mlog_errno(ret);
6557                 goto out;
6558         }
6559
6560         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6561                                  numpages, phys, handle);
6562
6563         /*
6564          * Initiate writeout of the pages we zero'd here. We don't
6565          * wait on them - the truncate_inode_pages() call later will
6566          * do that for us.
6567          */
6568         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6569                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6570         if (ret)
6571                 mlog_errno(ret);
6572
6573 out:
6574         if (pages)
6575                 kfree(pages);
6576
6577         return ret;
6578 }
6579
6580 static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
6581 {
6582         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6583
6584         memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
6585 }
6586
6587 void ocfs2_dinode_new_extent_list(struct inode *inode,
6588                                   struct ocfs2_dinode *di)
6589 {
6590         ocfs2_zero_dinode_id2(inode, di);
6591         di->id2.i_list.l_tree_depth = 0;
6592         di->id2.i_list.l_next_free_rec = 0;
6593         di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
6594 }
6595
6596 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6597 {
6598         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6599         struct ocfs2_inline_data *idata = &di->id2.i_data;
6600
6601         spin_lock(&oi->ip_lock);
6602         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6603         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6604         spin_unlock(&oi->ip_lock);
6605
6606         /*
6607          * We clear the entire i_data structure here so that all
6608          * fields can be properly initialized.
6609          */
6610         ocfs2_zero_dinode_id2(inode, di);
6611
6612         idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
6613 }
6614
6615 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6616                                          struct buffer_head *di_bh)
6617 {
6618         int ret, i, has_data, num_pages = 0;
6619         handle_t *handle;
6620         u64 uninitialized_var(block);
6621         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6622         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6623         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6624         struct ocfs2_alloc_context *data_ac = NULL;
6625         struct page **pages = NULL;
6626         loff_t end = osb->s_clustersize;
6627
6628         has_data = i_size_read(inode) ? 1 : 0;
6629
6630         if (has_data) {
6631                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6632                                 sizeof(struct page *), GFP_NOFS);
6633                 if (pages == NULL) {
6634                         ret = -ENOMEM;
6635                         mlog_errno(ret);
6636                         goto out;
6637                 }
6638
6639                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6640                 if (ret) {
6641                         mlog_errno(ret);
6642                         goto out;
6643                 }
6644         }
6645
6646         handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6647         if (IS_ERR(handle)) {
6648                 ret = PTR_ERR(handle);
6649                 mlog_errno(ret);
6650                 goto out_unlock;
6651         }
6652
6653         ret = ocfs2_journal_access(handle, inode, di_bh,
6654                                    OCFS2_JOURNAL_ACCESS_WRITE);
6655         if (ret) {
6656                 mlog_errno(ret);
6657                 goto out_commit;
6658         }
6659
6660         if (has_data) {
6661                 u32 bit_off, num;
6662                 unsigned int page_end;
6663                 u64 phys;
6664
6665                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6666                                            &num);
6667                 if (ret) {
6668                         mlog_errno(ret);
6669                         goto out_commit;
6670                 }
6671
6672                 /*
6673                  * Save two copies, one for insert, and one that can
6674                  * be changed by ocfs2_map_and_dirty_page() below.
6675                  */
6676                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6677
6678                 /*
6679                  * Non sparse file systems zero on extend, so no need
6680                  * to do that now.
6681                  */
6682                 if (!ocfs2_sparse_alloc(osb) &&
6683                     PAGE_CACHE_SIZE < osb->s_clustersize)
6684                         end = PAGE_CACHE_SIZE;
6685
6686                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6687                 if (ret) {
6688                         mlog_errno(ret);
6689                         goto out_commit;
6690                 }
6691
6692                 /*
6693                  * This should populate the 1st page for us and mark
6694                  * it up to date.
6695                  */
6696                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6697                 if (ret) {
6698                         mlog_errno(ret);
6699                         goto out_commit;
6700                 }
6701
6702                 page_end = PAGE_CACHE_SIZE;
6703                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
6704                         page_end = osb->s_clustersize;
6705
6706                 for (i = 0; i < num_pages; i++)
6707                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6708                                                  pages[i], i > 0, &phys);
6709         }
6710
6711         spin_lock(&oi->ip_lock);
6712         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6713         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6714         spin_unlock(&oi->ip_lock);
6715
6716         ocfs2_dinode_new_extent_list(inode, di);
6717
6718         ocfs2_journal_dirty(handle, di_bh);
6719
6720         if (has_data) {
6721                 /*
6722                  * An error at this point should be extremely rare. If
6723                  * this proves to be false, we could always re-build
6724                  * the in-inode data from our pages.
6725                  */
6726                 ret = ocfs2_dinode_insert_extent(osb, handle, inode, di_bh,
6727                                                  0, block, 1, 0, NULL);
6728                 if (ret) {
6729                         mlog_errno(ret);
6730                         goto out_commit;
6731                 }
6732
6733                 inode->i_blocks = ocfs2_inode_sector_count(inode);
6734         }
6735
6736 out_commit:
6737         ocfs2_commit_trans(osb, handle);
6738
6739 out_unlock:
6740         if (data_ac)
6741                 ocfs2_free_alloc_context(data_ac);
6742
6743 out:
6744         if (pages) {
6745                 ocfs2_unlock_and_free_pages(pages, num_pages);
6746                 kfree(pages);
6747         }
6748
6749         return ret;
6750 }
6751
6752 /*
6753  * It is expected, that by the time you call this function,
6754  * inode->i_size and fe->i_size have been adjusted.
6755  *
6756  * WARNING: This will kfree the truncate context
6757  */
6758 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6759                           struct inode *inode,
6760                           struct buffer_head *fe_bh,
6761                           struct ocfs2_truncate_context *tc)
6762 {
6763         int status, i, credits, tl_sem = 0;
6764         u32 clusters_to_del, new_highest_cpos, range;
6765         struct ocfs2_extent_list *el;
6766         handle_t *handle = NULL;
6767         struct inode *tl_inode = osb->osb_tl_inode;
6768         struct ocfs2_path *path = NULL;
6769         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6770
6771         mlog_entry_void();
6772
6773         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6774                                                      i_size_read(inode));
6775
6776         path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6777         if (!path) {
6778                 status = -ENOMEM;
6779                 mlog_errno(status);
6780                 goto bail;
6781         }
6782
6783         ocfs2_extent_map_trunc(inode, new_highest_cpos);
6784
6785 start:
6786         /*
6787          * Check that we still have allocation to delete.
6788          */
6789         if (OCFS2_I(inode)->ip_clusters == 0) {
6790                 status = 0;
6791                 goto bail;
6792         }
6793
6794         /*
6795          * Truncate always works against the rightmost tree branch.
6796          */
6797         status = ocfs2_find_path(inode, path, UINT_MAX);
6798         if (status) {
6799                 mlog_errno(status);
6800                 goto bail;
6801         }
6802
6803         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6804              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6805
6806         /*
6807          * By now, el will point to the extent list on the bottom most
6808          * portion of this tree. Only the tail record is considered in
6809          * each pass.
6810          *
6811          * We handle the following cases, in order:
6812          * - empty extent: delete the remaining branch
6813          * - remove the entire record
6814          * - remove a partial record
6815          * - no record needs to be removed (truncate has completed)
6816          */
6817         el = path_leaf_el(path);
6818         if (le16_to_cpu(el->l_next_free_rec) == 0) {
6819                 ocfs2_error(inode->i_sb,
6820                             "Inode %llu has empty extent block at %llu\n",
6821                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
6822                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
6823                 status = -EROFS;
6824                 goto bail;
6825         }
6826
6827         i = le16_to_cpu(el->l_next_free_rec) - 1;
6828         range = le32_to_cpu(el->l_recs[i].e_cpos) +
6829                 ocfs2_rec_clusters(el, &el->l_recs[i]);
6830         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6831                 clusters_to_del = 0;
6832         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6833                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6834         } else if (range > new_highest_cpos) {
6835                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6836                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
6837                                   new_highest_cpos;
6838         } else {
6839                 status = 0;
6840                 goto bail;
6841         }
6842
6843         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6844              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6845
6846         mutex_lock(&tl_inode->i_mutex);
6847         tl_sem = 1;
6848         /* ocfs2_truncate_log_needs_flush guarantees us at least one
6849          * record is free for use. If there isn't any, we flush to get
6850          * an empty truncate log.  */
6851         if (ocfs2_truncate_log_needs_flush(osb)) {
6852                 status = __ocfs2_flush_truncate_log(osb);
6853                 if (status < 0) {
6854                         mlog_errno(status);
6855                         goto bail;
6856                 }
6857         }
6858
6859         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6860                                                 (struct ocfs2_dinode *)fe_bh->b_data,
6861                                                 el);
6862         handle = ocfs2_start_trans(osb, credits);
6863         if (IS_ERR(handle)) {
6864                 status = PTR_ERR(handle);
6865                 handle = NULL;
6866                 mlog_errno(status);
6867                 goto bail;
6868         }
6869
6870         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
6871                                    tc, path);
6872         if (status < 0) {
6873                 mlog_errno(status);
6874                 goto bail;
6875         }
6876
6877         mutex_unlock(&tl_inode->i_mutex);
6878         tl_sem = 0;
6879
6880         ocfs2_commit_trans(osb, handle);
6881         handle = NULL;
6882
6883         ocfs2_reinit_path(path, 1);
6884
6885         /*
6886          * The check above will catch the case where we've truncated
6887          * away all allocation.
6888          */
6889         goto start;
6890
6891 bail:
6892
6893         ocfs2_schedule_truncate_log_flush(osb, 1);
6894
6895         if (tl_sem)
6896                 mutex_unlock(&tl_inode->i_mutex);
6897
6898         if (handle)
6899                 ocfs2_commit_trans(osb, handle);
6900
6901         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6902
6903         ocfs2_free_path(path);
6904
6905         /* This will drop the ext_alloc cluster lock for us */
6906         ocfs2_free_truncate_context(tc);
6907
6908         mlog_exit(status);
6909         return status;
6910 }
6911
6912 /*
6913  * Expects the inode to already be locked.
6914  */
6915 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6916                            struct inode *inode,
6917                            struct buffer_head *fe_bh,
6918                            struct ocfs2_truncate_context **tc)
6919 {
6920         int status;
6921         unsigned int new_i_clusters;
6922         struct ocfs2_dinode *fe;
6923         struct ocfs2_extent_block *eb;
6924         struct buffer_head *last_eb_bh = NULL;
6925
6926         mlog_entry_void();
6927
6928         *tc = NULL;
6929
6930         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
6931                                                   i_size_read(inode));
6932         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6933
6934         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
6935              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
6936              (unsigned long long)le64_to_cpu(fe->i_size));
6937
6938         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
6939         if (!(*tc)) {
6940                 status = -ENOMEM;
6941                 mlog_errno(status);
6942                 goto bail;
6943         }
6944         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6945
6946         if (fe->id2.i_list.l_tree_depth) {
6947                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
6948                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
6949                 if (status < 0) {
6950                         mlog_errno(status);
6951                         goto bail;
6952                 }
6953                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6954                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6955                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6956
6957                         brelse(last_eb_bh);
6958                         status = -EIO;
6959                         goto bail;
6960                 }
6961         }
6962
6963         (*tc)->tc_last_eb_bh = last_eb_bh;
6964
6965         status = 0;
6966 bail:
6967         if (status < 0) {
6968                 if (*tc)
6969                         ocfs2_free_truncate_context(*tc);
6970                 *tc = NULL;
6971         }
6972         mlog_exit_void();
6973         return status;
6974 }
6975
6976 /*
6977  * 'start' is inclusive, 'end' is not.
6978  */
6979 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
6980                           unsigned int start, unsigned int end, int trunc)
6981 {
6982         int ret;
6983         unsigned int numbytes;
6984         handle_t *handle;
6985         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6986         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6987         struct ocfs2_inline_data *idata = &di->id2.i_data;
6988
6989         if (end > i_size_read(inode))
6990                 end = i_size_read(inode);
6991
6992         BUG_ON(start >= end);
6993
6994         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
6995             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
6996             !ocfs2_supports_inline_data(osb)) {
6997                 ocfs2_error(inode->i_sb,
6998                             "Inline data flags for inode %llu don't agree! "
6999                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7000                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7001                             le16_to_cpu(di->i_dyn_features),
7002                             OCFS2_I(inode)->ip_dyn_features,
7003                             osb->s_feature_incompat);
7004                 ret = -EROFS;
7005                 goto out;
7006         }
7007
7008         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7009         if (IS_ERR(handle)) {
7010                 ret = PTR_ERR(handle);
7011                 mlog_errno(ret);
7012                 goto out;
7013         }
7014
7015         ret = ocfs2_journal_access(handle, inode, di_bh,
7016                                    OCFS2_JOURNAL_ACCESS_WRITE);
7017         if (ret) {
7018                 mlog_errno(ret);
7019                 goto out_commit;
7020         }
7021
7022         numbytes = end - start;
7023         memset(idata->id_data + start, 0, numbytes);
7024
7025         /*
7026          * No need to worry about the data page here - it's been
7027          * truncated already and inline data doesn't need it for
7028          * pushing zero's to disk, so we'll let readpage pick it up
7029          * later.
7030          */
7031         if (trunc) {
7032                 i_size_write(inode, start);
7033                 di->i_size = cpu_to_le64(start);
7034         }
7035
7036         inode->i_blocks = ocfs2_inode_sector_count(inode);
7037         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7038
7039         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7040         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7041
7042         ocfs2_journal_dirty(handle, di_bh);
7043
7044 out_commit:
7045         ocfs2_commit_trans(osb, handle);
7046
7047 out:
7048         return ret;
7049 }
7050
7051 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7052 {
7053         /*
7054          * The caller is responsible for completing deallocation
7055          * before freeing the context.
7056          */
7057         if (tc->tc_dealloc.c_first_suballocator != NULL)
7058                 mlog(ML_NOTICE,
7059                      "Truncate completion has non-empty dealloc context\n");
7060
7061         if (tc->tc_last_eb_bh)
7062                 brelse(tc->tc_last_eb_bh);
7063
7064         kfree(tc);
7065 }