WorkStruct: make allyesconfig
[safe/jmp/linux-2.6] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30
31 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
32 #include <cluster/masklog.h>
33
34 #include "ocfs2.h"
35
36 #include "alloc.h"
37 #include "dlmglue.h"
38 #include "extent_map.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "sysfile.h"
44 #include "file.h"
45 #include "super.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
50 static int ocfs2_extent_contig(struct inode *inode,
51                                struct ocfs2_extent_rec *ext,
52                                u64 blkno);
53
54 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
55                                      struct ocfs2_journal_handle *handle,
56                                      struct inode *inode,
57                                      int wanted,
58                                      struct ocfs2_alloc_context *meta_ac,
59                                      struct buffer_head *bhs[]);
60
61 static int ocfs2_add_branch(struct ocfs2_super *osb,
62                             struct ocfs2_journal_handle *handle,
63                             struct inode *inode,
64                             struct buffer_head *fe_bh,
65                             struct buffer_head *eb_bh,
66                             struct buffer_head *last_eb_bh,
67                             struct ocfs2_alloc_context *meta_ac);
68
69 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
70                                   struct ocfs2_journal_handle *handle,
71                                   struct inode *inode,
72                                   struct buffer_head *fe_bh,
73                                   struct ocfs2_alloc_context *meta_ac,
74                                   struct buffer_head **ret_new_eb_bh);
75
76 static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
77                                   struct ocfs2_journal_handle *handle,
78                                   struct inode *inode,
79                                   struct buffer_head *fe_bh,
80                                   u64 blkno,
81                                   u32 new_clusters);
82
83 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
84                                     struct inode *inode,
85                                     struct buffer_head *fe_bh,
86                                     struct buffer_head **target_bh);
87
88 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
89                                        struct inode *inode,
90                                        struct ocfs2_dinode *fe,
91                                        unsigned int new_i_clusters,
92                                        struct buffer_head *old_last_eb,
93                                        struct buffer_head **new_last_eb);
94
95 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96
97 static int ocfs2_extent_contig(struct inode *inode,
98                                struct ocfs2_extent_rec *ext,
99                                u64 blkno)
100 {
101         return blkno == (le64_to_cpu(ext->e_blkno) +
102                          ocfs2_clusters_to_blocks(inode->i_sb,
103                                                   le32_to_cpu(ext->e_clusters)));
104 }
105
106 /*
107  * How many free extents have we got before we need more meta data?
108  */
109 int ocfs2_num_free_extents(struct ocfs2_super *osb,
110                            struct inode *inode,
111                            struct ocfs2_dinode *fe)
112 {
113         int retval;
114         struct ocfs2_extent_list *el;
115         struct ocfs2_extent_block *eb;
116         struct buffer_head *eb_bh = NULL;
117
118         mlog_entry_void();
119
120         if (!OCFS2_IS_VALID_DINODE(fe)) {
121                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
122                 retval = -EIO;
123                 goto bail;
124         }
125
126         if (fe->i_last_eb_blk) {
127                 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
128                                           &eb_bh, OCFS2_BH_CACHED, inode);
129                 if (retval < 0) {
130                         mlog_errno(retval);
131                         goto bail;
132                 }
133                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
134                 el = &eb->h_list;
135         } else
136                 el = &fe->id2.i_list;
137
138         BUG_ON(el->l_tree_depth != 0);
139
140         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
141 bail:
142         if (eb_bh)
143                 brelse(eb_bh);
144
145         mlog_exit(retval);
146         return retval;
147 }
148
149 /* expects array to already be allocated
150  *
151  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
152  * l_count for you
153  */
154 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
155                                      struct ocfs2_journal_handle *handle,
156                                      struct inode *inode,
157                                      int wanted,
158                                      struct ocfs2_alloc_context *meta_ac,
159                                      struct buffer_head *bhs[])
160 {
161         int count, status, i;
162         u16 suballoc_bit_start;
163         u32 num_got;
164         u64 first_blkno;
165         struct ocfs2_extent_block *eb;
166
167         mlog_entry_void();
168
169         count = 0;
170         while (count < wanted) {
171                 status = ocfs2_claim_metadata(osb,
172                                               handle,
173                                               meta_ac,
174                                               wanted - count,
175                                               &suballoc_bit_start,
176                                               &num_got,
177                                               &first_blkno);
178                 if (status < 0) {
179                         mlog_errno(status);
180                         goto bail;
181                 }
182
183                 for(i = count;  i < (num_got + count); i++) {
184                         bhs[i] = sb_getblk(osb->sb, first_blkno);
185                         if (bhs[i] == NULL) {
186                                 status = -EIO;
187                                 mlog_errno(status);
188                                 goto bail;
189                         }
190                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
191
192                         status = ocfs2_journal_access(handle, inode, bhs[i],
193                                                       OCFS2_JOURNAL_ACCESS_CREATE);
194                         if (status < 0) {
195                                 mlog_errno(status);
196                                 goto bail;
197                         }
198
199                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
200                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
201                         /* Ok, setup the minimal stuff here. */
202                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
203                         eb->h_blkno = cpu_to_le64(first_blkno);
204                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
205
206 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
207                         /* we always use slot zero's suballocator */
208                         eb->h_suballoc_slot = 0;
209 #else
210                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
211 #endif
212                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
213                         eb->h_list.l_count =
214                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
215
216                         suballoc_bit_start++;
217                         first_blkno++;
218
219                         /* We'll also be dirtied by the caller, so
220                          * this isn't absolutely necessary. */
221                         status = ocfs2_journal_dirty(handle, bhs[i]);
222                         if (status < 0) {
223                                 mlog_errno(status);
224                                 goto bail;
225                         }
226                 }
227
228                 count += num_got;
229         }
230
231         status = 0;
232 bail:
233         if (status < 0) {
234                 for(i = 0; i < wanted; i++) {
235                         if (bhs[i])
236                                 brelse(bhs[i]);
237                         bhs[i] = NULL;
238                 }
239         }
240         mlog_exit(status);
241         return status;
242 }
243
244 /*
245  * Add an entire tree branch to our inode. eb_bh is the extent block
246  * to start at, if we don't want to start the branch at the dinode
247  * structure.
248  *
249  * last_eb_bh is required as we have to update it's next_leaf pointer
250  * for the new last extent block.
251  *
252  * the new branch will be 'empty' in the sense that every block will
253  * contain a single record with e_clusters == 0.
254  */
255 static int ocfs2_add_branch(struct ocfs2_super *osb,
256                             struct ocfs2_journal_handle *handle,
257                             struct inode *inode,
258                             struct buffer_head *fe_bh,
259                             struct buffer_head *eb_bh,
260                             struct buffer_head *last_eb_bh,
261                             struct ocfs2_alloc_context *meta_ac)
262 {
263         int status, new_blocks, i;
264         u64 next_blkno, new_last_eb_blk;
265         struct buffer_head *bh;
266         struct buffer_head **new_eb_bhs = NULL;
267         struct ocfs2_dinode *fe;
268         struct ocfs2_extent_block *eb;
269         struct ocfs2_extent_list  *eb_el;
270         struct ocfs2_extent_list  *el;
271
272         mlog_entry_void();
273
274         BUG_ON(!last_eb_bh);
275
276         fe = (struct ocfs2_dinode *) fe_bh->b_data;
277
278         if (eb_bh) {
279                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
280                 el = &eb->h_list;
281         } else
282                 el = &fe->id2.i_list;
283
284         /* we never add a branch to a leaf. */
285         BUG_ON(!el->l_tree_depth);
286
287         new_blocks = le16_to_cpu(el->l_tree_depth);
288
289         /* allocate the number of new eb blocks we need */
290         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
291                              GFP_KERNEL);
292         if (!new_eb_bhs) {
293                 status = -ENOMEM;
294                 mlog_errno(status);
295                 goto bail;
296         }
297
298         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
299                                            meta_ac, new_eb_bhs);
300         if (status < 0) {
301                 mlog_errno(status);
302                 goto bail;
303         }
304
305         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306          * linked with the rest of the tree.
307          * conversly, new_eb_bhs[0] is the new bottommost leaf.
308          *
309          * when we leave the loop, new_last_eb_blk will point to the
310          * newest leaf, and next_blkno will point to the topmost extent
311          * block. */
312         next_blkno = new_last_eb_blk = 0;
313         for(i = 0; i < new_blocks; i++) {
314                 bh = new_eb_bhs[i];
315                 eb = (struct ocfs2_extent_block *) bh->b_data;
316                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
317                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
318                         status = -EIO;
319                         goto bail;
320                 }
321                 eb_el = &eb->h_list;
322
323                 status = ocfs2_journal_access(handle, inode, bh,
324                                               OCFS2_JOURNAL_ACCESS_CREATE);
325                 if (status < 0) {
326                         mlog_errno(status);
327                         goto bail;
328                 }
329
330                 eb->h_next_leaf_blk = 0;
331                 eb_el->l_tree_depth = cpu_to_le16(i);
332                 eb_el->l_next_free_rec = cpu_to_le16(1);
333                 eb_el->l_recs[0].e_cpos = fe->i_clusters;
334                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335                 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
336                 if (!eb_el->l_tree_depth)
337                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338
339                 status = ocfs2_journal_dirty(handle, bh);
340                 if (status < 0) {
341                         mlog_errno(status);
342                         goto bail;
343                 }
344
345                 next_blkno = le64_to_cpu(eb->h_blkno);
346         }
347
348         /* This is a bit hairy. We want to update up to three blocks
349          * here without leaving any of them in an inconsistent state
350          * in case of error. We don't have to worry about
351          * journal_dirty erroring as it won't unless we've aborted the
352          * handle (in which case we would never be here) so reserving
353          * the write with journal_access is all we need to do. */
354         status = ocfs2_journal_access(handle, inode, last_eb_bh,
355                                       OCFS2_JOURNAL_ACCESS_WRITE);
356         if (status < 0) {
357                 mlog_errno(status);
358                 goto bail;
359         }
360         status = ocfs2_journal_access(handle, inode, fe_bh,
361                                       OCFS2_JOURNAL_ACCESS_WRITE);
362         if (status < 0) {
363                 mlog_errno(status);
364                 goto bail;
365         }
366         if (eb_bh) {
367                 status = ocfs2_journal_access(handle, inode, eb_bh,
368                                               OCFS2_JOURNAL_ACCESS_WRITE);
369                 if (status < 0) {
370                         mlog_errno(status);
371                         goto bail;
372                 }
373         }
374
375         /* Link the new branch into the rest of the tree (el will
376          * either be on the fe, or the extent block passed in. */
377         i = le16_to_cpu(el->l_next_free_rec);
378         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379         el->l_recs[i].e_cpos = fe->i_clusters;
380         el->l_recs[i].e_clusters = 0;
381         le16_add_cpu(&el->l_next_free_rec, 1);
382
383         /* fe needs a new last extent block pointer, as does the
384          * next_leaf on the previously last-extent-block. */
385         fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
386
387         eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
388         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
389
390         status = ocfs2_journal_dirty(handle, last_eb_bh);
391         if (status < 0)
392                 mlog_errno(status);
393         status = ocfs2_journal_dirty(handle, fe_bh);
394         if (status < 0)
395                 mlog_errno(status);
396         if (eb_bh) {
397                 status = ocfs2_journal_dirty(handle, eb_bh);
398                 if (status < 0)
399                         mlog_errno(status);
400         }
401
402         status = 0;
403 bail:
404         if (new_eb_bhs) {
405                 for (i = 0; i < new_blocks; i++)
406                         if (new_eb_bhs[i])
407                                 brelse(new_eb_bhs[i]);
408                 kfree(new_eb_bhs);
409         }
410
411         mlog_exit(status);
412         return status;
413 }
414
415 /*
416  * adds another level to the allocation tree.
417  * returns back the new extent block so you can add a branch to it
418  * after this call.
419  */
420 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
421                                   struct ocfs2_journal_handle *handle,
422                                   struct inode *inode,
423                                   struct buffer_head *fe_bh,
424                                   struct ocfs2_alloc_context *meta_ac,
425                                   struct buffer_head **ret_new_eb_bh)
426 {
427         int status, i;
428         struct buffer_head *new_eb_bh = NULL;
429         struct ocfs2_dinode *fe;
430         struct ocfs2_extent_block *eb;
431         struct ocfs2_extent_list  *fe_el;
432         struct ocfs2_extent_list  *eb_el;
433
434         mlog_entry_void();
435
436         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
437                                            &new_eb_bh);
438         if (status < 0) {
439                 mlog_errno(status);
440                 goto bail;
441         }
442
443         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
444         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
445                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
446                 status = -EIO;
447                 goto bail;
448         }
449
450         eb_el = &eb->h_list;
451         fe = (struct ocfs2_dinode *) fe_bh->b_data;
452         fe_el = &fe->id2.i_list;
453
454         status = ocfs2_journal_access(handle, inode, new_eb_bh,
455                                       OCFS2_JOURNAL_ACCESS_CREATE);
456         if (status < 0) {
457                 mlog_errno(status);
458                 goto bail;
459         }
460
461         /* copy the fe data into the new extent block */
462         eb_el->l_tree_depth = fe_el->l_tree_depth;
463         eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464         for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465                 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466                 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467                 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468         }
469
470         status = ocfs2_journal_dirty(handle, new_eb_bh);
471         if (status < 0) {
472                 mlog_errno(status);
473                 goto bail;
474         }
475
476         status = ocfs2_journal_access(handle, inode, fe_bh,
477                                       OCFS2_JOURNAL_ACCESS_WRITE);
478         if (status < 0) {
479                 mlog_errno(status);
480                 goto bail;
481         }
482
483         /* update fe now */
484         le16_add_cpu(&fe_el->l_tree_depth, 1);
485         fe_el->l_recs[0].e_cpos = 0;
486         fe_el->l_recs[0].e_blkno = eb->h_blkno;
487         fe_el->l_recs[0].e_clusters = fe->i_clusters;
488         for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489                 fe_el->l_recs[i].e_cpos = 0;
490                 fe_el->l_recs[i].e_clusters = 0;
491                 fe_el->l_recs[i].e_blkno = 0;
492         }
493         fe_el->l_next_free_rec = cpu_to_le16(1);
494
495         /* If this is our 1st tree depth shift, then last_eb_blk
496          * becomes the allocated extent block */
497         if (fe_el->l_tree_depth == cpu_to_le16(1))
498                 fe->i_last_eb_blk = eb->h_blkno;
499
500         status = ocfs2_journal_dirty(handle, fe_bh);
501         if (status < 0) {
502                 mlog_errno(status);
503                 goto bail;
504         }
505
506         *ret_new_eb_bh = new_eb_bh;
507         new_eb_bh = NULL;
508         status = 0;
509 bail:
510         if (new_eb_bh)
511                 brelse(new_eb_bh);
512
513         mlog_exit(status);
514         return status;
515 }
516
517 /*
518  * Expects the tree to already have room in the rightmost leaf for the
519  * extent.  Updates all the extent blocks (and the dinode) on the way
520  * down.
521  */
522 static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523                                   struct ocfs2_journal_handle *handle,
524                                   struct inode *inode,
525                                   struct buffer_head *fe_bh,
526                                   u64 start_blk,
527                                   u32 new_clusters)
528 {
529         int status, i, num_bhs = 0;
530         u64 next_blkno;
531         u16 next_free;
532         struct buffer_head **eb_bhs = NULL;
533         struct ocfs2_dinode *fe;
534         struct ocfs2_extent_block *eb;
535         struct ocfs2_extent_list  *el;
536
537         mlog_entry_void();
538
539         status = ocfs2_journal_access(handle, inode, fe_bh,
540                                       OCFS2_JOURNAL_ACCESS_WRITE);
541         if (status < 0) {
542                 mlog_errno(status);
543                 goto bail;
544         }
545
546         fe = (struct ocfs2_dinode *) fe_bh->b_data;
547         el = &fe->id2.i_list;
548         if (el->l_tree_depth) {
549                 /* This is another operation where we want to be
550                  * careful about our tree updates. An error here means
551                  * none of the previous changes we made should roll
552                  * forward. As a result, we have to record the buffers
553                  * for this part of the tree in an array and reserve a
554                  * journal write to them before making any changes. */
555                 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556                 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557                                  GFP_KERNEL);
558                 if (!eb_bhs) {
559                         status = -ENOMEM;
560                         mlog_errno(status);
561                         goto bail;
562                 }
563
564                 i = 0;
565                 while(el->l_tree_depth) {
566                         next_free = le16_to_cpu(el->l_next_free_rec);
567                         if (next_free == 0) {
568                                 ocfs2_error(inode->i_sb,
569                                             "Dinode %llu has a bad extent list",
570                                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
571                                 status = -EIO;
572                                 goto bail;
573                         }
574                         next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
575
576                         BUG_ON(i >= num_bhs);
577                         status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578                                                   OCFS2_BH_CACHED, inode);
579                         if (status < 0) {
580                                 mlog_errno(status);
581                                 goto bail;
582                         }
583                         eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584                         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
586                                                                  eb);
587                                 status = -EIO;
588                                 goto bail;
589                         }
590
591                         status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592                                                       OCFS2_JOURNAL_ACCESS_WRITE);
593                         if (status < 0) {
594                                 mlog_errno(status);
595                                 goto bail;
596                         }
597
598                         el = &eb->h_list;
599                         i++;
600                         /* When we leave this loop, eb_bhs[num_bhs - 1] will
601                          * hold the bottom-most leaf extent block. */
602                 }
603                 BUG_ON(el->l_tree_depth);
604
605                 el = &fe->id2.i_list;
606                 /* If we have tree depth, then the fe update is
607                  * trivial, and we want to switch el out for the
608                  * bottom-most leaf in order to update it with the
609                  * actual extent data below. */
610                 next_free = le16_to_cpu(el->l_next_free_rec);
611                 if (next_free == 0) {
612                         ocfs2_error(inode->i_sb,
613                                     "Dinode %llu has a bad extent list",
614                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
615                         status = -EIO;
616                         goto bail;
617                 }
618                 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
619                              new_clusters);
620                 /* (num_bhs - 1) to avoid the leaf */
621                 for(i = 0; i < (num_bhs - 1); i++) {
622                         eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
623                         el = &eb->h_list;
624
625                         /* finally, make our actual change to the
626                          * intermediate extent blocks. */
627                         next_free = le16_to_cpu(el->l_next_free_rec);
628                         le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
629                                      new_clusters);
630
631                         status = ocfs2_journal_dirty(handle, eb_bhs[i]);
632                         if (status < 0)
633                                 mlog_errno(status);
634                 }
635                 BUG_ON(i != (num_bhs - 1));
636                 /* note that the leaf block wasn't touched in
637                  * the loop above */
638                 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
639                 el = &eb->h_list;
640                 BUG_ON(el->l_tree_depth);
641         }
642
643         /* yay, we can finally add the actual extent now! */
644         i = le16_to_cpu(el->l_next_free_rec) - 1;
645         if (le16_to_cpu(el->l_next_free_rec) &&
646             ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647                 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648         } else if (le16_to_cpu(el->l_next_free_rec) &&
649                    (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650                 /* having an empty extent at eof is legal. */
651                 if (el->l_recs[i].e_cpos != fe->i_clusters) {
652                         ocfs2_error(inode->i_sb,
653                                     "Dinode %llu trailing extent is bad: "
654                                     "cpos (%u) != number of clusters (%u)",
655                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
656                                     le32_to_cpu(el->l_recs[i].e_cpos),
657                                     le32_to_cpu(fe->i_clusters));
658                         status = -EIO;
659                         goto bail;
660                 }
661                 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662                 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
663         } else {
664                 /* No contiguous record, or no empty record at eof, so
665                  * we add a new one. */
666
667                 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668                        le16_to_cpu(el->l_count));
669                 i = le16_to_cpu(el->l_next_free_rec);
670
671                 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672                 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673                 el->l_recs[i].e_cpos = fe->i_clusters;
674                 le16_add_cpu(&el->l_next_free_rec, 1);
675         }
676
677         /*
678          * extent_map errors are not fatal, so they are ignored outside
679          * of flushing the thing.
680          */
681         status = ocfs2_extent_map_append(inode, &el->l_recs[i],
682                                          new_clusters);
683         if (status) {
684                 mlog_errno(status);
685                 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
686         }
687
688         status = ocfs2_journal_dirty(handle, fe_bh);
689         if (status < 0)
690                 mlog_errno(status);
691         if (fe->id2.i_list.l_tree_depth) {
692                 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
693                 if (status < 0)
694                         mlog_errno(status);
695         }
696
697         status = 0;
698 bail:
699         if (eb_bhs) {
700                 for (i = 0; i < num_bhs; i++)
701                         if (eb_bhs[i])
702                                 brelse(eb_bhs[i]);
703                 kfree(eb_bhs);
704         }
705
706         mlog_exit(status);
707         return status;
708 }
709
710 /*
711  * Should only be called when there is no space left in any of the
712  * leaf nodes. What we want to do is find the lowest tree depth
713  * non-leaf extent block with room for new records. There are three
714  * valid results of this search:
715  *
716  * 1) a lowest extent block is found, then we pass it back in
717  *    *lowest_eb_bh and return '0'
718  *
719  * 2) the search fails to find anything, but the dinode has room. We
720  *    pass NULL back in *lowest_eb_bh, but still return '0'
721  *
722  * 3) the search fails to find anything AND the dinode is full, in
723  *    which case we return > 0
724  *
725  * return status < 0 indicates an error.
726  */
727 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
728                                     struct inode *inode,
729                                     struct buffer_head *fe_bh,
730                                     struct buffer_head **target_bh)
731 {
732         int status = 0, i;
733         u64 blkno;
734         struct ocfs2_dinode *fe;
735         struct ocfs2_extent_block *eb;
736         struct ocfs2_extent_list  *el;
737         struct buffer_head *bh = NULL;
738         struct buffer_head *lowest_bh = NULL;
739
740         mlog_entry_void();
741
742         *target_bh = NULL;
743
744         fe = (struct ocfs2_dinode *) fe_bh->b_data;
745         el = &fe->id2.i_list;
746
747         while(le16_to_cpu(el->l_tree_depth) > 1) {
748                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
749                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
750                                     "extent list (next_free_rec == 0)",
751                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
752                         status = -EIO;
753                         goto bail;
754                 }
755                 i = le16_to_cpu(el->l_next_free_rec) - 1;
756                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
757                 if (!blkno) {
758                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
759                                     "list where extent # %d has no physical "
760                                     "block start",
761                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
762                         status = -EIO;
763                         goto bail;
764                 }
765
766                 if (bh) {
767                         brelse(bh);
768                         bh = NULL;
769                 }
770
771                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
772                                           inode);
773                 if (status < 0) {
774                         mlog_errno(status);
775                         goto bail;
776                 }
777
778                 eb = (struct ocfs2_extent_block *) bh->b_data;
779                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
780                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
781                         status = -EIO;
782                         goto bail;
783                 }
784                 el = &eb->h_list;
785
786                 if (le16_to_cpu(el->l_next_free_rec) <
787                     le16_to_cpu(el->l_count)) {
788                         if (lowest_bh)
789                                 brelse(lowest_bh);
790                         lowest_bh = bh;
791                         get_bh(lowest_bh);
792                 }
793         }
794
795         /* If we didn't find one and the fe doesn't have any room,
796          * then return '1' */
797         if (!lowest_bh
798             && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
799                 status = 1;
800
801         *target_bh = lowest_bh;
802 bail:
803         if (bh)
804                 brelse(bh);
805
806         mlog_exit(status);
807         return status;
808 }
809
810 /* the caller needs to update fe->i_clusters */
811 int ocfs2_insert_extent(struct ocfs2_super *osb,
812                         struct ocfs2_journal_handle *handle,
813                         struct inode *inode,
814                         struct buffer_head *fe_bh,
815                         u64 start_blk,
816                         u32 new_clusters,
817                         struct ocfs2_alloc_context *meta_ac)
818 {
819         int status, i, shift;
820         struct buffer_head *last_eb_bh = NULL;
821         struct buffer_head *bh = NULL;
822         struct ocfs2_dinode *fe;
823         struct ocfs2_extent_block *eb;
824         struct ocfs2_extent_list  *el;
825
826         mlog_entry_void();
827
828         mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
829              new_clusters, (unsigned long long)start_blk,
830              (unsigned long long)OCFS2_I(inode)->ip_blkno);
831
832         fe = (struct ocfs2_dinode *) fe_bh->b_data;
833         el = &fe->id2.i_list;
834
835         if (el->l_tree_depth) {
836                 /* jump to end of tree */
837                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
838                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
839                 if (status < 0) {
840                         mlog_exit(status);
841                         goto bail;
842                 }
843                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
844                 el = &eb->h_list;
845         }
846
847         /* Can we allocate without adding/shifting tree bits? */
848         i = le16_to_cpu(el->l_next_free_rec) - 1;
849         if (le16_to_cpu(el->l_next_free_rec) == 0
850             || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
851             || le32_to_cpu(el->l_recs[i].e_clusters) == 0
852             || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
853                 goto out_add;
854
855         mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
856              "tree now.\n");
857
858         shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
859         if (shift < 0) {
860                 status = shift;
861                 mlog_errno(status);
862                 goto bail;
863         }
864
865         /* We traveled all the way to the bottom of the allocation tree
866          * and didn't find room for any more extents - we need to add
867          * another tree level */
868         if (shift) {
869                 /* if we hit a leaf, we'd better be empty :) */
870                 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
871                        le16_to_cpu(el->l_count));
872                 BUG_ON(bh);
873                 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
874                      "(current = %u)\n",
875                      le16_to_cpu(fe->id2.i_list.l_tree_depth));
876
877                 /* ocfs2_shift_tree_depth will return us a buffer with
878                  * the new extent block (so we can pass that to
879                  * ocfs2_add_branch). */
880                 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
881                                                 meta_ac, &bh);
882                 if (status < 0) {
883                         mlog_errno(status);
884                         goto bail;
885                 }
886                 /* Special case: we have room now if we shifted from
887                  * tree_depth 0 */
888                 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
889                         goto out_add;
890         }
891
892         /* call ocfs2_add_branch to add the final part of the tree with
893          * the new data. */
894         mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
895         status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
896                                   meta_ac);
897         if (status < 0) {
898                 mlog_errno(status);
899                 goto bail;
900         }
901
902 out_add:
903         /* Finally, we can add clusters. */
904         status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
905                                         start_blk, new_clusters);
906         if (status < 0)
907                 mlog_errno(status);
908
909 bail:
910         if (bh)
911                 brelse(bh);
912
913         if (last_eb_bh)
914                 brelse(last_eb_bh);
915
916         mlog_exit(status);
917         return status;
918 }
919
920 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
921 {
922         struct buffer_head *tl_bh = osb->osb_tl_bh;
923         struct ocfs2_dinode *di;
924         struct ocfs2_truncate_log *tl;
925
926         di = (struct ocfs2_dinode *) tl_bh->b_data;
927         tl = &di->id2.i_dealloc;
928
929         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
930                         "slot %d, invalid truncate log parameters: used = "
931                         "%u, count = %u\n", osb->slot_num,
932                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
933         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
934 }
935
936 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
937                                            unsigned int new_start)
938 {
939         unsigned int tail_index;
940         unsigned int current_tail;
941
942         /* No records, nothing to coalesce */
943         if (!le16_to_cpu(tl->tl_used))
944                 return 0;
945
946         tail_index = le16_to_cpu(tl->tl_used) - 1;
947         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
948         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
949
950         return current_tail == new_start;
951 }
952
953 static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
954                                      struct ocfs2_journal_handle *handle,
955                                      u64 start_blk,
956                                      unsigned int num_clusters)
957 {
958         int status, index;
959         unsigned int start_cluster, tl_count;
960         struct inode *tl_inode = osb->osb_tl_inode;
961         struct buffer_head *tl_bh = osb->osb_tl_bh;
962         struct ocfs2_dinode *di;
963         struct ocfs2_truncate_log *tl;
964
965         mlog_entry("start_blk = %llu, num_clusters = %u\n",
966                    (unsigned long long)start_blk, num_clusters);
967
968         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
969
970         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
971
972         di = (struct ocfs2_dinode *) tl_bh->b_data;
973         tl = &di->id2.i_dealloc;
974         if (!OCFS2_IS_VALID_DINODE(di)) {
975                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
976                 status = -EIO;
977                 goto bail;
978         }
979
980         tl_count = le16_to_cpu(tl->tl_count);
981         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
982                         tl_count == 0,
983                         "Truncate record count on #%llu invalid "
984                         "wanted %u, actual %u\n",
985                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
986                         ocfs2_truncate_recs_per_inode(osb->sb),
987                         le16_to_cpu(tl->tl_count));
988
989         /* Caller should have known to flush before calling us. */
990         index = le16_to_cpu(tl->tl_used);
991         if (index >= tl_count) {
992                 status = -ENOSPC;
993                 mlog_errno(status);
994                 goto bail;
995         }
996
997         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
998                                       OCFS2_JOURNAL_ACCESS_WRITE);
999         if (status < 0) {
1000                 mlog_errno(status);
1001                 goto bail;
1002         }
1003
1004         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
1005              "%llu (index = %d)\n", num_clusters, start_cluster,
1006              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
1007
1008         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
1009                 /*
1010                  * Move index back to the record we are coalescing with.
1011                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
1012                  */
1013                 index--;
1014
1015                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
1016                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
1017                      index, le32_to_cpu(tl->tl_recs[index].t_start),
1018                      num_clusters);
1019         } else {
1020                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
1021                 tl->tl_used = cpu_to_le16(index + 1);
1022         }
1023         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
1024
1025         status = ocfs2_journal_dirty(handle, tl_bh);
1026         if (status < 0) {
1027                 mlog_errno(status);
1028                 goto bail;
1029         }
1030
1031 bail:
1032         mlog_exit(status);
1033         return status;
1034 }
1035
1036 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
1037                                          struct ocfs2_journal_handle *handle,
1038                                          struct inode *data_alloc_inode,
1039                                          struct buffer_head *data_alloc_bh)
1040 {
1041         int status = 0;
1042         int i;
1043         unsigned int num_clusters;
1044         u64 start_blk;
1045         struct ocfs2_truncate_rec rec;
1046         struct ocfs2_dinode *di;
1047         struct ocfs2_truncate_log *tl;
1048         struct inode *tl_inode = osb->osb_tl_inode;
1049         struct buffer_head *tl_bh = osb->osb_tl_bh;
1050
1051         mlog_entry_void();
1052
1053         di = (struct ocfs2_dinode *) tl_bh->b_data;
1054         tl = &di->id2.i_dealloc;
1055         i = le16_to_cpu(tl->tl_used) - 1;
1056         while (i >= 0) {
1057                 /* Caller has given us at least enough credits to
1058                  * update the truncate log dinode */
1059                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
1060                                               OCFS2_JOURNAL_ACCESS_WRITE);
1061                 if (status < 0) {
1062                         mlog_errno(status);
1063                         goto bail;
1064                 }
1065
1066                 tl->tl_used = cpu_to_le16(i);
1067
1068                 status = ocfs2_journal_dirty(handle, tl_bh);
1069                 if (status < 0) {
1070                         mlog_errno(status);
1071                         goto bail;
1072                 }
1073
1074                 /* TODO: Perhaps we can calculate the bulk of the
1075                  * credits up front rather than extending like
1076                  * this. */
1077                 status = ocfs2_extend_trans(handle,
1078                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
1079                 if (status < 0) {
1080                         mlog_errno(status);
1081                         goto bail;
1082                 }
1083
1084                 rec = tl->tl_recs[i];
1085                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
1086                                                     le32_to_cpu(rec.t_start));
1087                 num_clusters = le32_to_cpu(rec.t_clusters);
1088
1089                 /* if start_blk is not set, we ignore the record as
1090                  * invalid. */
1091                 if (start_blk) {
1092                         mlog(0, "free record %d, start = %u, clusters = %u\n",
1093                              i, le32_to_cpu(rec.t_start), num_clusters);
1094
1095                         status = ocfs2_free_clusters(handle, data_alloc_inode,
1096                                                      data_alloc_bh, start_blk,
1097                                                      num_clusters);
1098                         if (status < 0) {
1099                                 mlog_errno(status);
1100                                 goto bail;
1101                         }
1102                 }
1103                 i--;
1104         }
1105
1106 bail:
1107         mlog_exit(status);
1108         return status;
1109 }
1110
1111 /* Expects you to already be holding tl_inode->i_mutex */
1112 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1113 {
1114         int status;
1115         unsigned int num_to_flush;
1116         struct ocfs2_journal_handle *handle = NULL;
1117         struct inode *tl_inode = osb->osb_tl_inode;
1118         struct inode *data_alloc_inode = NULL;
1119         struct buffer_head *tl_bh = osb->osb_tl_bh;
1120         struct buffer_head *data_alloc_bh = NULL;
1121         struct ocfs2_dinode *di;
1122         struct ocfs2_truncate_log *tl;
1123
1124         mlog_entry_void();
1125
1126         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
1127
1128         di = (struct ocfs2_dinode *) tl_bh->b_data;
1129         tl = &di->id2.i_dealloc;
1130         if (!OCFS2_IS_VALID_DINODE(di)) {
1131                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
1132                 status = -EIO;
1133                 goto bail;
1134         }
1135
1136         num_to_flush = le16_to_cpu(tl->tl_used);
1137         mlog(0, "Flush %u records from truncate log #%llu\n",
1138              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
1139         if (!num_to_flush) {
1140                 status = 0;
1141                 goto bail;
1142         }
1143
1144         handle = ocfs2_alloc_handle(osb);
1145         if (!handle) {
1146                 status = -ENOMEM;
1147                 mlog_errno(status);
1148                 goto bail;
1149         }
1150
1151         data_alloc_inode = ocfs2_get_system_file_inode(osb,
1152                                                        GLOBAL_BITMAP_SYSTEM_INODE,
1153                                                        OCFS2_INVALID_SLOT);
1154         if (!data_alloc_inode) {
1155                 status = -EINVAL;
1156                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
1157                 goto bail;
1158         }
1159
1160         ocfs2_handle_add_inode(handle, data_alloc_inode);
1161         status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
1162         if (status < 0) {
1163                 mlog_errno(status);
1164                 goto bail;
1165         }
1166
1167         handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
1168         if (IS_ERR(handle)) {
1169                 status = PTR_ERR(handle);
1170                 handle = NULL;
1171                 mlog_errno(status);
1172                 goto bail;
1173         }
1174
1175         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
1176                                                data_alloc_bh);
1177         if (status < 0) {
1178                 mlog_errno(status);
1179                 goto bail;
1180         }
1181
1182 bail:
1183         if (handle)
1184                 ocfs2_commit_trans(handle);
1185
1186         if (data_alloc_inode)
1187                 iput(data_alloc_inode);
1188
1189         if (data_alloc_bh)
1190                 brelse(data_alloc_bh);
1191
1192         mlog_exit(status);
1193         return status;
1194 }
1195
1196 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1197 {
1198         int status;
1199         struct inode *tl_inode = osb->osb_tl_inode;
1200
1201         mutex_lock(&tl_inode->i_mutex);
1202         status = __ocfs2_flush_truncate_log(osb);
1203         mutex_unlock(&tl_inode->i_mutex);
1204
1205         return status;
1206 }
1207
1208 static void ocfs2_truncate_log_worker(struct work_struct *work)
1209 {
1210         int status;
1211         struct ocfs2_super *osb =
1212                 container_of(work, struct ocfs2_super,
1213                              osb_truncate_log_wq.work);
1214
1215         mlog_entry_void();
1216
1217         status = ocfs2_flush_truncate_log(osb);
1218         if (status < 0)
1219                 mlog_errno(status);
1220
1221         mlog_exit(status);
1222 }
1223
1224 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
1225 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
1226                                        int cancel)
1227 {
1228         if (osb->osb_tl_inode) {
1229                 /* We want to push off log flushes while truncates are
1230                  * still running. */
1231                 if (cancel)
1232                         cancel_delayed_work(&osb->osb_truncate_log_wq);
1233
1234                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
1235                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
1236         }
1237 }
1238
1239 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
1240                                        int slot_num,
1241                                        struct inode **tl_inode,
1242                                        struct buffer_head **tl_bh)
1243 {
1244         int status;
1245         struct inode *inode = NULL;
1246         struct buffer_head *bh = NULL;
1247
1248         inode = ocfs2_get_system_file_inode(osb,
1249                                            TRUNCATE_LOG_SYSTEM_INODE,
1250                                            slot_num);
1251         if (!inode) {
1252                 status = -EINVAL;
1253                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
1254                 goto bail;
1255         }
1256
1257         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
1258                                   OCFS2_BH_CACHED, inode);
1259         if (status < 0) {
1260                 iput(inode);
1261                 mlog_errno(status);
1262                 goto bail;
1263         }
1264
1265         *tl_inode = inode;
1266         *tl_bh    = bh;
1267 bail:
1268         mlog_exit(status);
1269         return status;
1270 }
1271
1272 /* called during the 1st stage of node recovery. we stamp a clean
1273  * truncate log and pass back a copy for processing later. if the
1274  * truncate log does not require processing, a *tl_copy is set to
1275  * NULL. */
1276 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
1277                                       int slot_num,
1278                                       struct ocfs2_dinode **tl_copy)
1279 {
1280         int status;
1281         struct inode *tl_inode = NULL;
1282         struct buffer_head *tl_bh = NULL;
1283         struct ocfs2_dinode *di;
1284         struct ocfs2_truncate_log *tl;
1285
1286         *tl_copy = NULL;
1287
1288         mlog(0, "recover truncate log from slot %d\n", slot_num);
1289
1290         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
1291         if (status < 0) {
1292                 mlog_errno(status);
1293                 goto bail;
1294         }
1295
1296         di = (struct ocfs2_dinode *) tl_bh->b_data;
1297         tl = &di->id2.i_dealloc;
1298         if (!OCFS2_IS_VALID_DINODE(di)) {
1299                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
1300                 status = -EIO;
1301                 goto bail;
1302         }
1303
1304         if (le16_to_cpu(tl->tl_used)) {
1305                 mlog(0, "We'll have %u logs to recover\n",
1306                      le16_to_cpu(tl->tl_used));
1307
1308                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
1309                 if (!(*tl_copy)) {
1310                         status = -ENOMEM;
1311                         mlog_errno(status);
1312                         goto bail;
1313                 }
1314
1315                 /* Assuming the write-out below goes well, this copy
1316                  * will be passed back to recovery for processing. */
1317                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
1318
1319                 /* All we need to do to clear the truncate log is set
1320                  * tl_used. */
1321                 tl->tl_used = 0;
1322
1323                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
1324                 if (status < 0) {
1325                         mlog_errno(status);
1326                         goto bail;
1327                 }
1328         }
1329
1330 bail:
1331         if (tl_inode)
1332                 iput(tl_inode);
1333         if (tl_bh)
1334                 brelse(tl_bh);
1335
1336         if (status < 0 && (*tl_copy)) {
1337                 kfree(*tl_copy);
1338                 *tl_copy = NULL;
1339         }
1340
1341         mlog_exit(status);
1342         return status;
1343 }
1344
1345 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1346                                          struct ocfs2_dinode *tl_copy)
1347 {
1348         int status = 0;
1349         int i;
1350         unsigned int clusters, num_recs, start_cluster;
1351         u64 start_blk;
1352         struct ocfs2_journal_handle *handle;
1353         struct inode *tl_inode = osb->osb_tl_inode;
1354         struct ocfs2_truncate_log *tl;
1355
1356         mlog_entry_void();
1357
1358         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
1359                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
1360                 return -EINVAL;
1361         }
1362
1363         tl = &tl_copy->id2.i_dealloc;
1364         num_recs = le16_to_cpu(tl->tl_used);
1365         mlog(0, "cleanup %u records from %llu\n", num_recs,
1366              (unsigned long long)tl_copy->i_blkno);
1367
1368         mutex_lock(&tl_inode->i_mutex);
1369         for(i = 0; i < num_recs; i++) {
1370                 if (ocfs2_truncate_log_needs_flush(osb)) {
1371                         status = __ocfs2_flush_truncate_log(osb);
1372                         if (status < 0) {
1373                                 mlog_errno(status);
1374                                 goto bail_up;
1375                         }
1376                 }
1377
1378                 handle = ocfs2_start_trans(osb, NULL,
1379                                            OCFS2_TRUNCATE_LOG_UPDATE);
1380                 if (IS_ERR(handle)) {
1381                         status = PTR_ERR(handle);
1382                         mlog_errno(status);
1383                         goto bail_up;
1384                 }
1385
1386                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
1387                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
1388                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
1389
1390                 status = ocfs2_truncate_log_append(osb, handle,
1391                                                    start_blk, clusters);
1392                 ocfs2_commit_trans(handle);
1393                 if (status < 0) {
1394                         mlog_errno(status);
1395                         goto bail_up;
1396                 }
1397         }
1398
1399 bail_up:
1400         mutex_unlock(&tl_inode->i_mutex);
1401
1402         mlog_exit(status);
1403         return status;
1404 }
1405
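/*
 * Tear down the truncate log state set up by ocfs2_truncate_log_init():
 * cancel the delayed flush work, flush any records still in the log,
 * and drop the buffer head and inode references held in the osb.
 */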
1406 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
1407 {
1408         int status;
1409         struct inode *tl_inode = osb->osb_tl_inode;
1410
1411         mlog_entry_void();
1412
1413         if (tl_inode) {
1414                 cancel_delayed_work(&osb->osb_truncate_log_wq);
1415                 flush_workqueue(ocfs2_wq);
1416
1417                 status = ocfs2_flush_truncate_log(osb);
1418                 if (status < 0)
1419                         mlog_errno(status);
1420
1421                 brelse(osb->osb_tl_bh);
1422                 iput(osb->osb_tl_inode);
1423         }
1424
1425         mlog_exit_void();
1426 }
1427
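/*
 * Look up this slot's truncate log inode and dinode buffer and stash
 * them in the osb, along with the delayed work item used to flush the
 * log in the background.
 */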
1428 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1429 {
1430         int status;
1431         struct inode *tl_inode = NULL;
1432         struct buffer_head *tl_bh = NULL;
1433
1434         mlog_entry_void();
1435
1436         status = ocfs2_get_truncate_log_info(osb,
1437                                              osb->slot_num,
1438                                              &tl_inode,
1439                                              &tl_bh);
1440         if (status < 0)
1441                 mlog_errno(status);
1442
1443         /* ocfs2_truncate_log_shutdown keys on the existence of
1444          * osb->osb_tl_inode so we don't set any of the osb variables
1445          * until we're sure all is well. */
1446         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
1447                           ocfs2_truncate_log_worker);
1448         osb->osb_tl_bh    = tl_bh;
1449         osb->osb_tl_inode = tl_inode;
1450
1451         mlog_exit(status);
1452         return status;
1453 }
1454
1455 /* This function will figure out whether the current last extent
1456  * block will be deleted, and if it will, what the new last extent
1457  * block will be so we can update its h_next_leaf_blk field, as well
1458  * as the dinode's i_last_eb_blk. */
1459 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
1460                                        struct inode *inode,
1461                                        struct ocfs2_dinode *fe,
1462                                        u32 new_i_clusters,
1463                                        struct buffer_head *old_last_eb,
1464                                        struct buffer_head **new_last_eb)
1465 {
1466         int i, status = 0;
1467         u64 block = 0;
1468         struct ocfs2_extent_block *eb;
1469         struct ocfs2_extent_list *el;
1470         struct buffer_head *bh = NULL;
1471
1472         *new_last_eb = NULL;
1473
1474         if (!OCFS2_IS_VALID_DINODE(fe)) {
1475                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1476                 status = -EIO;
1477                 goto bail;
1478         }
1479
1480         /* we have no tree, so of course, no last_eb. */
1481         if (!fe->id2.i_list.l_tree_depth)
1482                 goto bail;
1483
1484         /* Truncate to zero is a special case - it forces tree_depth to 0
1485          * regardless of its current value. */
1486         if (!new_i_clusters)
1487                 goto bail;
1488
1489         eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
1490         el = &(eb->h_list);
1491         BUG_ON(!el->l_next_free_rec);
1492
1493         /* Make sure that this extent block will actually be empty after
1494          * we clear away the data. */
1495         if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1496                 goto bail;
1497
1498         /* Ok, at this point, we know that last_eb will definitely
1499          * change, so let's traverse the tree and find the second to
1500          * last extent block. */
1501         el = &(fe->id2.i_list);
1502         /* go down the tree, */
1503         do {
1504                 for (i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1505                         if (le32_to_cpu(el->l_recs[i].e_cpos) <
1506                             new_i_clusters) {
1507                                 block = le64_to_cpu(el->l_recs[i].e_blkno);
1508                                 break;
1509                         }
1510                 }
1511                 BUG_ON(i < 0);
1512
1513                 if (bh) {
1514                         brelse(bh);
1515                         bh = NULL;
1516                 }
1517
1518                 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
1519                                          inode);
1520                 if (status < 0) {
1521                         mlog_errno(status);
1522                         goto bail;
1523                 }
1524                 eb = (struct ocfs2_extent_block *) bh->b_data;
1525                 el = &eb->h_list;
1526                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1527                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1528                         status = -EIO;
1529                         goto bail;
1530                 }
1531         } while (el->l_tree_depth);
1532
1533         *new_last_eb = bh;
1534         get_bh(*new_last_eb);
1535         mlog(0, "returning block %llu\n",
1536              (unsigned long long)le64_to_cpu(eb->h_blkno));
1537 bail:
1538         if (bh)
1539                 brelse(bh);
1540
1541         return status;
1542 }
1543
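/*
 * Do one pass of the truncate: remove clusters_to_del clusters from the
 * rightmost extent record at each level of the tree, clearing records
 * that become empty, freeing extent blocks that become empty (currently
 * only those allocated from suballocator slot 0), and logging the freed
 * clusters in the truncate log. The caller is responsible for looping
 * until i_clusters reaches its target.
 */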
1544 static int ocfs2_do_truncate(struct ocfs2_super *osb,
1545                              unsigned int clusters_to_del,
1546                              struct inode *inode,
1547                              struct buffer_head *fe_bh,
1548                              struct buffer_head *old_last_eb_bh,
1549                              struct ocfs2_journal_handle *handle,
1550                              struct ocfs2_truncate_context *tc)
1551 {
1552         int status, i, depth;
1553         struct ocfs2_dinode *fe;
1554         struct ocfs2_extent_block *eb;
1555         struct ocfs2_extent_block *last_eb = NULL;
1556         struct ocfs2_extent_list *el;
1557         struct buffer_head *eb_bh = NULL;
1558         struct buffer_head *last_eb_bh = NULL;
1559         u64 next_eb = 0;
1560         u64 delete_blk = 0;
1561
1562         fe = (struct ocfs2_dinode *) fe_bh->b_data;
1563
1564         status = ocfs2_find_new_last_ext_blk(osb,
1565                                              inode,
1566                                              fe,
1567                                              le32_to_cpu(fe->i_clusters) -
1568                                                         clusters_to_del,
1569                                              old_last_eb_bh,
1570                                              &last_eb_bh);
1571         if (status < 0) {
1572                 mlog_errno(status);
1573                 goto bail;
1574         }
1575         if (last_eb_bh)
1576                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1577
1578         status = ocfs2_journal_access(handle, inode, fe_bh,
1579                                       OCFS2_JOURNAL_ACCESS_WRITE);
1580         if (status < 0) {
1581                 mlog_errno(status);
1582                 goto bail;
1583         }
1584         el = &(fe->id2.i_list);
1585
1586         spin_lock(&OCFS2_I(inode)->ip_lock);
1587         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1588                                       clusters_to_del;
1589         spin_unlock(&OCFS2_I(inode)->ip_lock);
1590         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1591         fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1592         fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1593
1594         i = le16_to_cpu(el->l_next_free_rec) - 1;
1595
1596         BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1597         le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1598         /* At tree depth zero we can just delete the clusters; otherwise
1599          * we need to record the next level extent block, as its record
1600          * may be cleared below. */
1601         if (!el->l_tree_depth)
1602                 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1603                         + ocfs2_clusters_to_blocks(osb->sb,
1604                                         le32_to_cpu(el->l_recs[i].e_clusters));
1605         else
1606                 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1607
1608         if (!el->l_recs[i].e_clusters) {
1609                 /* if we deleted the whole extent record, then clear
1610                  * out the other fields and update the extent
1611                  * list. For depth > 0 trees, we've already recorded
1612                  * the extent block in 'next_eb' */
1613                 el->l_recs[i].e_cpos = 0;
1614                 el->l_recs[i].e_blkno = 0;
1615                 BUG_ON(!el->l_next_free_rec);
1616                 le16_add_cpu(&el->l_next_free_rec, -1);
1617         }
1618
1619         depth = le16_to_cpu(el->l_tree_depth);
1620         if (!fe->i_clusters) {
1621                 /* trunc to zero is a special case. */
1622                 el->l_tree_depth = 0;
1623                 fe->i_last_eb_blk = 0;
1624         } else if (last_eb)
1625                 fe->i_last_eb_blk = last_eb->h_blkno;
1626
1627         status = ocfs2_journal_dirty(handle, fe_bh);
1628         if (status < 0) {
1629                 mlog_errno(status);
1630                 goto bail;
1631         }
1632
1633         if (last_eb) {
1634                 /* If there will be a new last extent block, then by
1635                  * definition, there cannot be any leaves to the right of
1636                  * it. */
1637                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1638                                               OCFS2_JOURNAL_ACCESS_WRITE);
1639                 if (status < 0) {
1640                         mlog_errno(status);
1641                         goto bail;
1642                 }
1643                 last_eb->h_next_leaf_blk = 0;
1644                 status = ocfs2_journal_dirty(handle, last_eb_bh);
1645                 if (status < 0) {
1646                         mlog_errno(status);
1647                         goto bail;
1648                 }
1649         }
1650
1651         /* if our tree depth > 0, update all the tree blocks below us. */
1652         while (depth) {
1653                 mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
1654                      depth,  (unsigned long long)next_eb);
1655                 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1656                                           OCFS2_BH_CACHED, inode);
1657                 if (status < 0) {
1658                         mlog_errno(status);
1659                         goto bail;
1660                 }
1661                 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1662                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1663                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1664                         status = -EIO;
1665                         goto bail;
1666                 }
1667                 el = &(eb->h_list);
1668
1669                 status = ocfs2_journal_access(handle, inode, eb_bh,
1670                                               OCFS2_JOURNAL_ACCESS_WRITE);
1671                 if (status < 0) {
1672                         mlog_errno(status);
1673                         goto bail;
1674                 }
1675
1676                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1677                 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
1678
1679                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1680
1681                 mlog(0, "extent block %llu, before: record %d: "
1682                      "(%u, %u, %llu), next = %u\n",
1683                      (unsigned long long)le64_to_cpu(eb->h_blkno), i,
1684                      le32_to_cpu(el->l_recs[i].e_cpos),
1685                      le32_to_cpu(el->l_recs[i].e_clusters),
1686                      (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1687                      le16_to_cpu(el->l_next_free_rec));
1688
1689                 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1690                 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1691
1692                 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1693                 /* bottom-most block requires us to delete data.*/
1694                 if (!el->l_tree_depth)
1695                         delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1696                                 + ocfs2_clusters_to_blocks(osb->sb,
1697                                         le32_to_cpu(el->l_recs[i].e_clusters));
1698                 if (!el->l_recs[i].e_clusters) {
1699                         el->l_recs[i].e_cpos = 0;
1700                         el->l_recs[i].e_blkno = 0;
1701                         BUG_ON(!el->l_next_free_rec);
1702                         le16_add_cpu(&el->l_next_free_rec, -1);
1703                 }
1704                 mlog(0, "extent block %llu, after: record %d: "
1705                      "(%u, %u, %llu), next = %u\n",
1706                      (unsigned long long)le64_to_cpu(eb->h_blkno), i,
1707                      le32_to_cpu(el->l_recs[i].e_cpos),
1708                      le32_to_cpu(el->l_recs[i].e_clusters),
1709                      (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1710                      le16_to_cpu(el->l_next_free_rec));
1711
1712                 status = ocfs2_journal_dirty(handle, eb_bh);
1713                 if (status < 0) {
1714                         mlog_errno(status);
1715                         goto bail;
1716                 }
1717
1718                 if (!el->l_next_free_rec) {
1719                         mlog(0, "deleting this extent block.\n");
1720
1721                         ocfs2_remove_from_cache(inode, eb_bh);
1722
1723                         BUG_ON(el->l_recs[0].e_clusters);
1724                         BUG_ON(el->l_recs[0].e_cpos);
1725                         BUG_ON(el->l_recs[0].e_blkno);
1726                         if (eb->h_suballoc_slot == 0) {
1727                                 /*
1728                                  * This code only understands how to
1729                                  * lock the suballocator in slot 0,
1730                                  * which is fine because allocation is
1731                                  * only ever done out of that
1732                                  * suballocator too. A future version
1733                                  * might change that, however, so avoid
1734                                  * a free if we don't know how to
1735                                  * handle it. This way an fs incompat
1736                                  * bit will not be necessary.
1737                                  */
1738                                 status = ocfs2_free_extent_block(handle,
1739                                                                  tc->tc_ext_alloc_inode,
1740                                                                  tc->tc_ext_alloc_bh,
1741                                                                  eb);
1742                                 if (status < 0) {
1743                                         mlog_errno(status);
1744                                         goto bail;
1745                                 }
1746                         }
1747                 }
1748                 brelse(eb_bh);
1749                 eb_bh = NULL;
1750                 depth--;
1751         }
1752
1753         BUG_ON(!delete_blk);
1754         status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1755                                            clusters_to_del);
1756         if (status < 0) {
1757                 mlog_errno(status);
1758                 goto bail;
1759         }
1760         status = 0;
1761 bail:
1762         if (!status)
1763                 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1764         else
1765                 ocfs2_extent_map_drop(inode, 0);
1766         mlog_exit(status);
1767         return status;
1768 }
1769
1770 /*
1771  * It is expected that, by the time you call this function,
1772  * inode->i_size and fe->i_size have been adjusted.
1773  *
1774  * WARNING: This will kfree the truncate context
1775  */
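/*
 * A rough sketch of the expected pairing (illustrative only; the real
 * callers live elsewhere in ocfs2). With the inode already locked and
 * i_size already shrunk:
 *
 *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 *	if (!status)
 *		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
 *
 * The caller must not touch tc afterwards, since ocfs2_commit_truncate()
 * always frees the truncate context.
 */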
1776 int ocfs2_commit_truncate(struct ocfs2_super *osb,
1777                           struct inode *inode,
1778                           struct buffer_head *fe_bh,
1779                           struct ocfs2_truncate_context *tc)
1780 {
1781         int status, i, credits, tl_sem = 0;
1782         u32 clusters_to_del, target_i_clusters;
1783         u64 last_eb = 0;
1784         struct ocfs2_dinode *fe;
1785         struct ocfs2_extent_block *eb;
1786         struct ocfs2_extent_list *el;
1787         struct buffer_head *last_eb_bh;
1788         struct ocfs2_journal_handle *handle = NULL;
1789         struct inode *tl_inode = osb->osb_tl_inode;
1790
1791         mlog_entry_void();
1792
1793         down_write(&OCFS2_I(inode)->ip_alloc_sem);
1794
1795         target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1796                                                      i_size_read(inode));
1797
1798         last_eb_bh = tc->tc_last_eb_bh;
1799         tc->tc_last_eb_bh = NULL;
1800
1801         fe = (struct ocfs2_dinode *) fe_bh->b_data;
1802
1803         if (fe->id2.i_list.l_tree_depth) {
1804                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1805                 el = &eb->h_list;
1806         } else
1807                 el = &fe->id2.i_list;
1808         last_eb = le64_to_cpu(fe->i_last_eb_blk);
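        /*
         * Each pass below trims clusters from the rightmost extent record
         * only, so we may loop back to 'start' several times before
         * i_clusters reaches target_i_clusters.
         */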
1809 start:
1810         mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
1811              "last_eb = %llu, fe->i_last_eb_blk = %llu, "
1812              "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
1813              le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
1814              (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
1815              le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
1816
1817         if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1818                 mlog(0, "last_eb changed!\n");
1819                 BUG_ON(!fe->id2.i_list.l_tree_depth);
1820                 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1821                 /* i_last_eb_blk may have changed, read it if
1822                  * necessary. We don't have to worry about the
1823                  * truncate to zero case here (where there no longer is a
1824                  * last_eb) because we never loop back after our work
1825                  * is done. */
1826                 if (last_eb_bh) {
1827                         brelse(last_eb_bh);
1828                         last_eb_bh = NULL;
1829                 }
1830
1831                 status = ocfs2_read_block(osb, last_eb,
1832                                           &last_eb_bh, OCFS2_BH_CACHED,
1833                                           inode);
1834                 if (status < 0) {
1835                         mlog_errno(status);
1836                         goto bail;
1837                 }
1838                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1839                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1840                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1841                         status = -EIO;
1842                         goto bail;
1843                 }
1844                 el = &(eb->h_list);
1845         }
1846
1847         /* By now, el will point to the extent list on the bottommost
1848          * portion of this tree. */
1849         i = le16_to_cpu(el->l_next_free_rec) - 1;
1850         if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
1851                 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1852         else
1853                 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1854                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
1855                                   target_i_clusters;
1856
1857         mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
1858
1859         mutex_lock(&tl_inode->i_mutex);
1860         tl_sem = 1;
1861         /* The check below, combined with the flush, guarantees us at
1862          * least one free truncate log record. If none is free, we flush
1863          * to get an empty truncate log. */
1864         if (ocfs2_truncate_log_needs_flush(osb)) {
1865                 status = __ocfs2_flush_truncate_log(osb);
1866                 if (status < 0) {
1867                         mlog_errno(status);
1868                         goto bail;
1869                 }
1870         }
1871
1872         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1873                                                 fe, el);
1874         handle = ocfs2_start_trans(osb, NULL, credits);
1875         if (IS_ERR(handle)) {
1876                 status = PTR_ERR(handle);
1877                 handle = NULL;
1878                 mlog_errno(status);
1879                 goto bail;
1880         }
1881
1882         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1883         status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1884         if (status < 0)
1885                 mlog_errno(status);
1886
1887         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1888                                    last_eb_bh, handle, tc);
1889         if (status < 0) {
1890                 mlog_errno(status);
1891                 goto bail;
1892         }
1893
1894         mutex_unlock(&tl_inode->i_mutex);
1895         tl_sem = 0;
1896
1897         ocfs2_commit_trans(handle);
1898         handle = NULL;
1899
1900         BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
1901         if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
1902                 goto start;
1903 bail:
1904         up_write(&OCFS2_I(inode)->ip_alloc_sem);
1905
1906         ocfs2_schedule_truncate_log_flush(osb, 1);
1907
1908         if (tl_sem)
1909                 mutex_unlock(&tl_inode->i_mutex);
1910
1911         if (handle)
1912                 ocfs2_commit_trans(handle);
1913
1914         if (last_eb_bh)
1915                 brelse(last_eb_bh);
1916
1917         /* This will drop the ext_alloc cluster lock for us */
1918         ocfs2_free_truncate_context(tc);
1919
1920         mlog_exit(status);
1921         return status;
1922 }
1923
1924
1925 /*
1926  * Expects the inode to already be locked. This will figure out which
1927  * inodes need to be locked and will put them on the returned truncate
1928  * context.
1929  */
1930 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1931                            struct inode *inode,
1932                            struct buffer_head *fe_bh,
1933                            struct ocfs2_truncate_context **tc)
1934 {
1935         int status, metadata_delete;
1936         unsigned int new_i_clusters;
1937         struct ocfs2_dinode *fe;
1938         struct ocfs2_extent_block *eb;
1939         struct ocfs2_extent_list *el;
1940         struct buffer_head *last_eb_bh = NULL;
1941         struct inode *ext_alloc_inode = NULL;
1942         struct buffer_head *ext_alloc_bh = NULL;
1943
1944         mlog_entry_void();
1945
1946         *tc = NULL;
1947
1948         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1949                                                   i_size_read(inode));
1950         fe = (struct ocfs2_dinode *) fe_bh->b_data;
1951
1952         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
1953              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
1954              (unsigned long long)le64_to_cpu(fe->i_size));
1955
1956         if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1957                 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1958                             "%u and size %llu whereas struct inode has "
1959                             "cluster count %u and size %llu which caused an "
1960                             "invalid truncate to %u clusters.",
1961                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1962                             le32_to_cpu(fe->i_clusters),
1963                             (unsigned long long)le64_to_cpu(fe->i_size),
1964                             OCFS2_I(inode)->ip_clusters,
1965                             (unsigned long long)i_size_read(inode), new_i_clusters);
1966                 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1967                 status = -EIO;
1968                 goto bail;
1969         }
1970
1971         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1972         if (!(*tc)) {
1973                 status = -ENOMEM;
1974                 mlog_errno(status);
1975                 goto bail;
1976         }
1977
1978         metadata_delete = 0;
1979         if (fe->id2.i_list.l_tree_depth) {
1980                 /* If we have a tree, then the truncate may result in
1981                  * metadata deletes. Figure this out from the
1982                  * rightmost leaf block.*/
1983                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
1984                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
1985                 if (status < 0) {
1986                         mlog_errno(status);
1987                         goto bail;
1988                 }
1989                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1990                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1991                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1992
1993                         brelse(last_eb_bh);
1994                         status = -EIO;
1995                         goto bail;
1996                 }
1997                 el = &(eb->h_list);
1998                 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
1999                         metadata_delete = 1;
2000         }
2001
2002         (*tc)->tc_last_eb_bh = last_eb_bh;
2003
2004         if (metadata_delete) {
2005                 mlog(0, "Will have to delete metadata for this trunc. "
2006                      "locking allocator.\n");
2007                 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
2008                 if (!ext_alloc_inode) {
2009                         status = -ENOMEM;
2010                         mlog_errno(status);
2011                         goto bail;
2012                 }
2013
2014                 mutex_lock(&ext_alloc_inode->i_mutex);
2015                 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
2016
2017                 status = ocfs2_meta_lock(ext_alloc_inode,
2018                                          NULL,
2019                                          &ext_alloc_bh,
2020                                          1);
2021                 if (status < 0) {
2022                         mlog_errno(status);
2023                         goto bail;
2024                 }
2025                 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
2026                 (*tc)->tc_ext_alloc_locked = 1;
2027         }
2028
2029         status = 0;
2030 bail:
2031         if (status < 0) {
2032                 if (*tc)
2033                         ocfs2_free_truncate_context(*tc);
2034                 *tc = NULL;
2035         }
2036         mlog_exit_void();
2037         return status;
2038 }
2039
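/*
 * Release whatever a truncate context holds: the extent allocator meta
 * lock, i_mutex and inode reference taken in ocfs2_prepare_truncate(),
 * any buffer heads, and finally the context itself.
 */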
2040 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
2041 {
2042         if (tc->tc_ext_alloc_inode) {
2043                 if (tc->tc_ext_alloc_locked)
2044                         ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
2045
2046                 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
2047                 iput(tc->tc_ext_alloc_inode);
2048         }
2049
2050         if (tc->tc_ext_alloc_bh)
2051                 brelse(tc->tc_ext_alloc_bh);
2052
2053         if (tc->tc_last_eb_bh)
2054                 brelse(tc->tc_last_eb_bh);
2055
2056         kfree(tc);
2057 }