ocfs2: Validate metadata only when it's read from disk.
[safe/jmp/linux-2.6] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46
47 #include "buffer_head_io.h"
48
49 #define NOT_ALLOC_NEW_GROUP             0
50 #define ALLOC_NEW_GROUP                 1
51
52 #define OCFS2_MAX_INODES_TO_STEAL       1024
53
54 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
55 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
56 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
57 static int ocfs2_block_group_fill(handle_t *handle,
58                                   struct inode *alloc_inode,
59                                   struct buffer_head *bg_bh,
60                                   u64 group_blkno,
61                                   u16 my_chain,
62                                   struct ocfs2_chain_list *cl);
63 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
64                                    struct inode *alloc_inode,
65                                    struct buffer_head *bh,
66                                    u64 max_block);
67
68 static int ocfs2_cluster_group_search(struct inode *inode,
69                                       struct buffer_head *group_bh,
70                                       u32 bits_wanted, u32 min_bits,
71                                       u64 max_block,
72                                       u16 *bit_off, u16 *bits_found);
73 static int ocfs2_block_group_search(struct inode *inode,
74                                     struct buffer_head *group_bh,
75                                     u32 bits_wanted, u32 min_bits,
76                                     u64 max_block,
77                                     u16 *bit_off, u16 *bits_found);
78 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
79                                      struct ocfs2_alloc_context *ac,
80                                      handle_t *handle,
81                                      u32 bits_wanted,
82                                      u32 min_bits,
83                                      u16 *bit_off,
84                                      unsigned int *num_bits,
85                                      u64 *bg_blkno);
86 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87                                          int nr);
88 static inline int ocfs2_block_group_set_bits(handle_t *handle,
89                                              struct inode *alloc_inode,
90                                              struct ocfs2_group_desc *bg,
91                                              struct buffer_head *group_bh,
92                                              unsigned int bit_off,
93                                              unsigned int num_bits);
94 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
95                                                struct inode *alloc_inode,
96                                                struct ocfs2_group_desc *bg,
97                                                struct buffer_head *group_bh,
98                                                unsigned int bit_off,
99                                                unsigned int num_bits);
100
101 static int ocfs2_relink_block_group(handle_t *handle,
102                                     struct inode *alloc_inode,
103                                     struct buffer_head *fe_bh,
104                                     struct buffer_head *bg_bh,
105                                     struct buffer_head *prev_bg_bh,
106                                     u16 chain);
107 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
108                                                      u32 wanted);
109 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110                                                    u64 bg_blkno,
111                                                    u16 bg_bit_off);
112 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
113                                                 u64 data_blkno,
114                                                 u64 *bg_blkno,
115                                                 u16 *bg_bit_off);
116 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
117                                              u32 bits_wanted, u64 max_block,
118                                              struct ocfs2_alloc_context **ac);
119
120 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
121 {
122         struct inode *inode = ac->ac_inode;
123
124         if (inode) {
125                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
126                         ocfs2_inode_unlock(inode, 1);
127
128                 mutex_unlock(&inode->i_mutex);
129
130                 iput(inode);
131                 ac->ac_inode = NULL;
132         }
133         brelse(ac->ac_bh);
134         ac->ac_bh = NULL;
135 }
136
/*
 * Tear down an allocation context completely: release whatever it still
 * holds via ocfs2_free_ac_resource(), then free the structure itself.
 * The pointer must not be used by the caller afterwards.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
        /* Resources first; they are reached through the context. */
        ocfs2_free_ac_resource(ac);

        kfree(ac);
}
142
143 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
144 {
145         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
146 }
147
148 #define do_error(fmt, ...)                                              \
149         do{                                                             \
150                 if (clean_error)                                        \
151                         mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
152                 else                                                    \
153                         ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
154         } while (0)
155
156 static int ocfs2_validate_gd_self(struct super_block *sb,
157                                   struct buffer_head *bh,
158                                   int clean_error)
159 {
160         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
161
162         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
163                 do_error("Group descriptor #%llu has bad signature %.*s",
164                          (unsigned long long)bh->b_blocknr, 7,
165                          gd->bg_signature);
166                 return -EINVAL;
167         }
168
169         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
170                 do_error("Group descriptor #%llu has an invalid bg_blkno "
171                          "of %llu",
172                          (unsigned long long)bh->b_blocknr,
173                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
174                 return -EINVAL;
175         }
176
177         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
178                 do_error("Group descriptor #%llu has an invalid "
179                          "fs_generation of #%u",
180                          (unsigned long long)bh->b_blocknr,
181                          le32_to_cpu(gd->bg_generation));
182                 return -EINVAL;
183         }
184
185         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
186                 do_error("Group descriptor #%llu has bit count %u but "
187                          "claims that %u are free",
188                          (unsigned long long)bh->b_blocknr,
189                          le16_to_cpu(gd->bg_bits),
190                          le16_to_cpu(gd->bg_free_bits_count));
191                 return -EINVAL;
192         }
193
194         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
195                 do_error("Group descriptor #%llu has bit count %u but "
196                          "max bitmap bits of %u",
197                          (unsigned long long)bh->b_blocknr,
198                          le16_to_cpu(gd->bg_bits),
199                          8 * le16_to_cpu(gd->bg_size));
200                 return -EINVAL;
201         }
202
203         return 0;
204 }
205
206 static int ocfs2_validate_gd_parent(struct super_block *sb,
207                                     struct ocfs2_dinode *di,
208                                     struct buffer_head *bh,
209                                     int clean_error)
210 {
211         unsigned int max_bits;
212         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
213
214         if (di->i_blkno != gd->bg_parent_dinode) {
215                 do_error("Group descriptor #%llu has bad parent "
216                          "pointer (%llu, expected %llu)",
217                          (unsigned long long)bh->b_blocknr,
218                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
219                          (unsigned long long)le64_to_cpu(di->i_blkno));
220                 return -EINVAL;
221         }
222
223         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
224         if (le16_to_cpu(gd->bg_bits) > max_bits) {
225                 do_error("Group descriptor #%llu has bit count of %u",
226                          (unsigned long long)bh->b_blocknr,
227                          le16_to_cpu(gd->bg_bits));
228                 return -EINVAL;
229         }
230
231         if (le16_to_cpu(gd->bg_chain) >=
232             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
233                 do_error("Group descriptor #%llu has bad chain %u",
234                          (unsigned long long)bh->b_blocknr,
235                          le16_to_cpu(gd->bg_chain));
236                 return -EINVAL;
237         }
238
239         return 0;
240 }
241
242 #undef do_error
243
/*
 * Validate a group descriptor in "clean error" mode: problems are only
 * printed and the filesystem is not failed.  Exists only for resize.
 *
 * The self-contained checks run first; the checks against the parent
 * dinode run only if those pass.  Returns 0 or the first error found.
 */
int ocfs2_check_group_descriptor(struct super_block *sb,
                                 struct ocfs2_dinode *di,
                                 struct buffer_head *bh)
{
        int rc = ocfs2_validate_gd_self(sb, bh, 1);

        if (rc)
                return rc;

        return ocfs2_validate_gd_parent(sb, di, bh, 1);
}
260
261 static int ocfs2_validate_group_descriptor(struct super_block *sb,
262                                            struct buffer_head *bh)
263 {
264         mlog(0, "Validating group descriptor %llu\n",
265              (unsigned long long)bh->b_blocknr);
266
267         return ocfs2_validate_gd_self(sb, bh, 0);
268 }
269
270 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
271                                 u64 gd_blkno, struct buffer_head **bh)
272 {
273         int rc;
274         struct buffer_head *tmp = *bh;
275
276         rc = ocfs2_read_block(inode, gd_blkno, &tmp,
277                               ocfs2_validate_group_descriptor);
278         if (rc)
279                 goto out;
280
281         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
282         if (rc) {
283                 brelse(tmp);
284                 goto out;
285         }
286
287         /* If ocfs2_read_block() got us a new bh, pass it up. */
288         if (!*bh)
289                 *bh = tmp;
290
291 out:
292         return rc;
293 }
294
295 static int ocfs2_block_group_fill(handle_t *handle,
296                                   struct inode *alloc_inode,
297                                   struct buffer_head *bg_bh,
298                                   u64 group_blkno,
299                                   u16 my_chain,
300                                   struct ocfs2_chain_list *cl)
301 {
302         int status = 0;
303         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
304         struct super_block * sb = alloc_inode->i_sb;
305
306         mlog_entry_void();
307
308         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
309                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
310                             "b_blocknr (%llu)",
311                             (unsigned long long)group_blkno,
312                             (unsigned long long) bg_bh->b_blocknr);
313                 status = -EIO;
314                 goto bail;
315         }
316
317         status = ocfs2_journal_access(handle,
318                                       alloc_inode,
319                                       bg_bh,
320                                       OCFS2_JOURNAL_ACCESS_CREATE);
321         if (status < 0) {
322                 mlog_errno(status);
323                 goto bail;
324         }
325
326         memset(bg, 0, sb->s_blocksize);
327         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
328         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
329         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
330         bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
331         bg->bg_chain = cpu_to_le16(my_chain);
332         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
333         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
334         bg->bg_blkno = cpu_to_le64(group_blkno);
335         /* set the 1st bit in the bitmap to account for the descriptor block */
336         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
337         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
338
339         status = ocfs2_journal_dirty(handle, bg_bh);
340         if (status < 0)
341                 mlog_errno(status);
342
343         /* There is no need to zero out or otherwise initialize the
344          * other blocks in a group - All valid FS metadata in a block
345          * group stores the superblock fs_generation value at
346          * allocation time. */
347
348 bail:
349         mlog_exit(status);
350         return status;
351 }
352
353 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
354 {
355         u16 curr, best;
356
357         best = curr = 0;
358         while (curr < le16_to_cpu(cl->cl_count)) {
359                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
360                     le32_to_cpu(cl->cl_recs[curr].c_total))
361                         best = curr;
362                 curr++;
363         }
364         return best;
365 }
366
367 /*
368  * We expect the block group allocator to already be locked.
369  */
370 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
371                                    struct inode *alloc_inode,
372                                    struct buffer_head *bh,
373                                    u64 max_block)
374 {
375         int status, credits;
376         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
377         struct ocfs2_chain_list *cl;
378         struct ocfs2_alloc_context *ac = NULL;
379         handle_t *handle = NULL;
380         u32 bit_off, num_bits;
381         u16 alloc_rec;
382         u64 bg_blkno;
383         struct buffer_head *bg_bh = NULL;
384         struct ocfs2_group_desc *bg;
385
386         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
387
388         mlog_entry_void();
389
390         cl = &fe->id2.i_chain;
391         status = ocfs2_reserve_clusters_with_limit(osb,
392                                                    le16_to_cpu(cl->cl_cpg),
393                                                    max_block, &ac);
394         if (status < 0) {
395                 if (status != -ENOSPC)
396                         mlog_errno(status);
397                 goto bail;
398         }
399
400         credits = ocfs2_calc_group_alloc_credits(osb->sb,
401                                                  le16_to_cpu(cl->cl_cpg));
402         handle = ocfs2_start_trans(osb, credits);
403         if (IS_ERR(handle)) {
404                 status = PTR_ERR(handle);
405                 handle = NULL;
406                 mlog_errno(status);
407                 goto bail;
408         }
409
410         status = ocfs2_claim_clusters(osb,
411                                       handle,
412                                       ac,
413                                       le16_to_cpu(cl->cl_cpg),
414                                       &bit_off,
415                                       &num_bits);
416         if (status < 0) {
417                 if (status != -ENOSPC)
418                         mlog_errno(status);
419                 goto bail;
420         }
421
422         alloc_rec = ocfs2_find_smallest_chain(cl);
423
424         /* setup the group */
425         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
426         mlog(0, "new descriptor, record %u, at block %llu\n",
427              alloc_rec, (unsigned long long)bg_blkno);
428
429         bg_bh = sb_getblk(osb->sb, bg_blkno);
430         if (!bg_bh) {
431                 status = -EIO;
432                 mlog_errno(status);
433                 goto bail;
434         }
435         ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
436
437         status = ocfs2_block_group_fill(handle,
438                                         alloc_inode,
439                                         bg_bh,
440                                         bg_blkno,
441                                         alloc_rec,
442                                         cl);
443         if (status < 0) {
444                 mlog_errno(status);
445                 goto bail;
446         }
447
448         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
449
450         status = ocfs2_journal_access(handle, alloc_inode,
451                                       bh, OCFS2_JOURNAL_ACCESS_WRITE);
452         if (status < 0) {
453                 mlog_errno(status);
454                 goto bail;
455         }
456
457         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
458                      le16_to_cpu(bg->bg_free_bits_count));
459         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
460         cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
461         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
462                 le16_add_cpu(&cl->cl_next_free_rec, 1);
463
464         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
465                                         le16_to_cpu(bg->bg_free_bits_count));
466         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
467         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
468
469         status = ocfs2_journal_dirty(handle, bh);
470         if (status < 0) {
471                 mlog_errno(status);
472                 goto bail;
473         }
474
475         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
476         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
477         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
478                                              le32_to_cpu(fe->i_clusters)));
479         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
480         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
481         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
482
483         status = 0;
484 bail:
485         if (handle)
486                 ocfs2_commit_trans(osb, handle);
487
488         if (ac)
489                 ocfs2_free_alloc_context(ac);
490
491         brelse(bg_bh);
492
493         mlog_exit(status);
494         return status;
495 }
496
497 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
498                                        struct ocfs2_alloc_context *ac,
499                                        int type,
500                                        u32 slot,
501                                        int alloc_new_group)
502 {
503         int status;
504         u32 bits_wanted = ac->ac_bits_wanted;
505         struct inode *alloc_inode;
506         struct buffer_head *bh = NULL;
507         struct ocfs2_dinode *fe;
508         u32 free_bits;
509
510         mlog_entry_void();
511
512         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
513         if (!alloc_inode) {
514                 mlog_errno(-EINVAL);
515                 return -EINVAL;
516         }
517
518         mutex_lock(&alloc_inode->i_mutex);
519
520         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
521         if (status < 0) {
522                 mutex_unlock(&alloc_inode->i_mutex);
523                 iput(alloc_inode);
524
525                 mlog_errno(status);
526                 return status;
527         }
528
529         ac->ac_inode = alloc_inode;
530         ac->ac_alloc_slot = slot;
531
532         fe = (struct ocfs2_dinode *) bh->b_data;
533
534         /* The bh was validated by the inode read inside
535          * ocfs2_inode_lock().  Any corruption is a code bug. */
536         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
537
538         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
539                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
540                             (unsigned long long)le64_to_cpu(fe->i_blkno));
541                 status = -EIO;
542                 goto bail;
543         }
544
545         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
546                 le32_to_cpu(fe->id1.bitmap1.i_used);
547
548         if (bits_wanted > free_bits) {
549                 /* cluster bitmap never grows */
550                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
551                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
552                              bits_wanted, free_bits);
553                         status = -ENOSPC;
554                         goto bail;
555                 }
556
557                 if (alloc_new_group != ALLOC_NEW_GROUP) {
558                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
559                              "and we don't alloc a new group for it.\n",
560                              slot, bits_wanted, free_bits);
561                         status = -ENOSPC;
562                         goto bail;
563                 }
564
565                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
566                                                  ac->ac_max_block);
567                 if (status < 0) {
568                         if (status != -ENOSPC)
569                                 mlog_errno(status);
570                         goto bail;
571                 }
572                 atomic_inc(&osb->alloc_stats.bg_extends);
573
574                 /* You should never ask for this much metadata */
575                 BUG_ON(bits_wanted >
576                        (le32_to_cpu(fe->id1.bitmap1.i_total)
577                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
578         }
579
580         get_bh(bh);
581         ac->ac_bh = bh;
582 bail:
583         brelse(bh);
584
585         mlog_exit(status);
586         return status;
587 }
588
589 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
590                                       int blocks,
591                                       struct ocfs2_alloc_context **ac)
592 {
593         int status;
594         u32 slot;
595
596         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
597         if (!(*ac)) {
598                 status = -ENOMEM;
599                 mlog_errno(status);
600                 goto bail;
601         }
602
603         (*ac)->ac_bits_wanted = blocks;
604         (*ac)->ac_which = OCFS2_AC_USE_META;
605         slot = osb->slot_num;
606         (*ac)->ac_group_search = ocfs2_block_group_search;
607
608         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
609                                              EXTENT_ALLOC_SYSTEM_INODE,
610                                              slot, ALLOC_NEW_GROUP);
611         if (status < 0) {
612                 if (status != -ENOSPC)
613                         mlog_errno(status);
614                 goto bail;
615         }
616
617         status = 0;
618 bail:
619         if ((status < 0) && *ac) {
620                 ocfs2_free_alloc_context(*ac);
621                 *ac = NULL;
622         }
623
624         mlog_exit(status);
625         return status;
626 }
627
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed() reports
 * for the extent list @root_el.  See ocfs2_reserve_new_metadata_blocks()
 * for the meaning of @ac and the return value.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
                               struct ocfs2_extent_list *root_el,
                               struct ocfs2_alloc_context **ac)
{
        int blocks = ocfs2_extend_meta_needed(root_el);

        return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
636
637 static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
638                                               struct ocfs2_alloc_context *ac)
639 {
640         int i, status = -ENOSPC;
641         s16 slot = ocfs2_get_inode_steal_slot(osb);
642
643         /* Start to steal inodes from the first slot after ours. */
644         if (slot == OCFS2_INVALID_SLOT)
645                 slot = osb->slot_num + 1;
646
647         for (i = 0; i < osb->max_slots; i++, slot++) {
648                 if (slot == osb->max_slots)
649                         slot = 0;
650
651                 if (slot == osb->slot_num)
652                         continue;
653
654                 status = ocfs2_reserve_suballoc_bits(osb, ac,
655                                                      INODE_ALLOC_SYSTEM_INODE,
656                                                      slot, NOT_ALLOC_NEW_GROUP);
657                 if (status >= 0) {
658                         ocfs2_set_inode_steal_slot(osb, slot);
659                         break;
660                 }
661
662                 ocfs2_free_ac_resource(ac);
663         }
664
665         return status;
666 }
667
668 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
669                             struct ocfs2_alloc_context **ac)
670 {
671         int status;
672         s16 slot = ocfs2_get_inode_steal_slot(osb);
673
674         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
675         if (!(*ac)) {
676                 status = -ENOMEM;
677                 mlog_errno(status);
678                 goto bail;
679         }
680
681         (*ac)->ac_bits_wanted = 1;
682         (*ac)->ac_which = OCFS2_AC_USE_INODE;
683
684         (*ac)->ac_group_search = ocfs2_block_group_search;
685
686         /*
687          * stat(2) can't handle i_ino > 32bits, so we tell the
688          * lower levels not to allocate us a block group past that
689          * limit.  The 'inode64' mount option avoids this behavior.
690          */
691         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
692                 (*ac)->ac_max_block = (u32)~0U;
693
694         /*
695          * slot is set when we successfully steal inode from other nodes.
696          * It is reset in 3 places:
697          * 1. when we flush the truncate log
698          * 2. when we complete local alloc recovery.
699          * 3. when we successfully allocate from our own slot.
700          * After it is set, we will go on stealing inodes until we find the
701          * need to check our slots to see whether there is some space for us.
702          */
703         if (slot != OCFS2_INVALID_SLOT &&
704             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
705                 goto inode_steal;
706
707         atomic_set(&osb->s_num_inodes_stolen, 0);
708         status = ocfs2_reserve_suballoc_bits(osb, *ac,
709                                              INODE_ALLOC_SYSTEM_INODE,
710                                              osb->slot_num, ALLOC_NEW_GROUP);
711         if (status >= 0) {
712                 status = 0;
713
714                 /*
715                  * Some inodes must be freed by us, so try to allocate
716                  * from our own next time.
717                  */
718                 if (slot != OCFS2_INVALID_SLOT)
719                         ocfs2_init_inode_steal_slot(osb);
720                 goto bail;
721         } else if (status < 0 && status != -ENOSPC) {
722                 mlog_errno(status);
723                 goto bail;
724         }
725
726         ocfs2_free_ac_resource(*ac);
727
728 inode_steal:
729         status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
730         atomic_inc(&osb->s_num_inodes_stolen);
731         if (status < 0) {
732                 if (status != -ENOSPC)
733                         mlog_errno(status);
734                 goto bail;
735         }
736
737         status = 0;
738 bail:
739         if ((status < 0) && *ac) {
740                 ocfs2_free_alloc_context(*ac);
741                 *ac = NULL;
742         }
743
744         mlog_exit(status);
745         return status;
746 }
747
748 /* local alloc code has to do the same thing, so rather than do this
749  * twice.. */
750 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
751                                       struct ocfs2_alloc_context *ac)
752 {
753         int status;
754
755         ac->ac_which = OCFS2_AC_USE_MAIN;
756         ac->ac_group_search = ocfs2_cluster_group_search;
757
758         status = ocfs2_reserve_suballoc_bits(osb, ac,
759                                              GLOBAL_BITMAP_SYSTEM_INODE,
760                                              OCFS2_INVALID_SLOT,
761                                              ALLOC_NEW_GROUP);
762         if (status < 0 && status != -ENOSPC) {
763                 mlog_errno(status);
764                 goto bail;
765         }
766
767 bail:
768         return status;
769 }
770
771 /* Callers don't need to care which bitmap (local alloc or main) to
772  * use so we figure it out for them, but unfortunately this clutters
773  * things a bit. */
774 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
775                                              u32 bits_wanted, u64 max_block,
776                                              struct ocfs2_alloc_context **ac)
777 {
778         int status;
779
780         mlog_entry_void();
781
782         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
783         if (!(*ac)) {
784                 status = -ENOMEM;
785                 mlog_errno(status);
786                 goto bail;
787         }
788
789         (*ac)->ac_bits_wanted = bits_wanted;
790         (*ac)->ac_max_block = max_block;
791
792         status = -ENOSPC;
793         if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
794                 status = ocfs2_reserve_local_alloc_bits(osb,
795                                                         bits_wanted,
796                                                         *ac);
797                 if (status == -EFBIG) {
798                         /* The local alloc window is outside ac_max_block.
799                          * use the main bitmap. */
800                         status = -ENOSPC;
801                 } else if ((status < 0) && (status != -ENOSPC)) {
802                         mlog_errno(status);
803                         goto bail;
804                 }
805         }
806
807         if (status == -ENOSPC) {
808                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
809                 if (status < 0) {
810                         if (status != -ENOSPC)
811                                 mlog_errno(status);
812                         goto bail;
813                 }
814         }
815
816         status = 0;
817 bail:
818         if ((status < 0) && *ac) {
819                 ocfs2_free_alloc_context(*ac);
820                 *ac = NULL;
821         }
822
823         mlog_exit(status);
824         return status;
825 }
826
827 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
828                            u32 bits_wanted,
829                            struct ocfs2_alloc_context **ac)
830 {
831         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
832 }
833
834 /*
835  * More or less lifted from ext3. I'll leave their description below:
836  *
837  * "For ext3 allocations, we must not reuse any blocks which are
838  * allocated in the bitmap buffer's "last committed data" copy.  This
839  * prevents deletes from freeing up the page for reuse until we have
840  * committed the delete transaction.
841  *
842  * If we didn't do this, then deleting something and reallocating it as
843  * data would allow the old block to be overwritten before the
844  * transaction committed (because we force data to disk before commit).
845  * This would lead to corruption if we crashed between overwriting the
846  * data and committing the delete.
847  *
848  * @@@ We may want to make this allocation behaviour conditional on
849  * data-writes at some point, and disable it for metadata allocations or
850  * sync-data inodes."
851  *
852  * Note: OCFS2 already does this differently for metadata vs data
853  * allocations, as those bitmaps are separate and undo access is never
854  * called on a metadata group descriptor.
855  */
856 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
857                                          int nr)
858 {
859         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
860
861         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
862                 return 0;
863         if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
864                 return 1;
865
866         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
867         return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
868 }
869
/*
 * Look for a run of clear, allocatable bits in the group bitmap held in
 * @bg_bh, examining only the first @total_bits bits.
 *
 * The scan stops as soon as a run of @bits_wanted bits has been found.
 * If no run that long exists, the longest run seen is reported instead,
 * so *bits_found may be smaller than @bits_wanted.
 *
 * Returns 0 with *bit_off set to the first bit of the run and
 * *bits_found to its length.  Returns -ENOSPC, without writing either
 * output, when not a single allocatable bit was found.
 *
 * @osb is not used by the body.
 */
870 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
871                                              struct buffer_head *bg_bh,
872                                              unsigned int bits_wanted,
873                                              unsigned int total_bits,
874                                              u16 *bit_off,
875                                              u16 *bits_found)
876 {
877         void *bitmap;
878         u16 best_offset, best_size;
879         int offset, start, found, status = 0;
880         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
881
882         /* Callers got this descriptor from
883          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
884         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
885
886         found = start = best_offset = best_size = 0;
887         bitmap = bg->bg_bitmap;
888
            /* Invariant: "found" is the length of the current run of
             * usable bits, and that run ends just before "start". */
889         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
890                 if (offset == total_bits)
891                         break;
892
893                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
894                         /* We found a zero, but we can't use it as it
895                          * hasn't been put to disk yet! */
896                         found = 0;
897                         start = offset + 1;
898                 } else if (offset == start) {
899                         /* we found a zero */
900                         found++;
901                         /* move start to the next bit to test */
902                         start++;
903                 } else {
904                         /* got a zero after some ones */
905                         found = 1;
906                         start = offset + 1;
907                 }
                    /* Remember the longest run so far; on a tie the
                     * earlier run is kept. */
908                 if (found > best_size) {
909                         best_size = found;
910                         best_offset = start - found;
911                 }
912                 /* we got everything we needed */
913                 if (found == bits_wanted) {
914                         /* mlog(0, "Found it all!\n"); */
915                         break;
916                 }
917         }
918
919         /* XXX: I think the first clause is equivalent to the second
920          *      - jlbec */
921         if (found == bits_wanted) {
922                 *bit_off = start - found;
923                 *bits_found = found;
924         } else if (best_size) {
925                 *bit_off = best_offset;
926                 *bits_found = best_size;
927         } else {
928                 status = -ENOSPC;
929                 /* No error log here -- see the comment above
930                  * ocfs2_test_bg_bit_allocatable */
931         }
932
933         return status;
934 }
935
936 static inline int ocfs2_block_group_set_bits(handle_t *handle,
937                                              struct inode *alloc_inode,
938                                              struct ocfs2_group_desc *bg,
939                                              struct buffer_head *group_bh,
940                                              unsigned int bit_off,
941                                              unsigned int num_bits)
942 {
943         int status;
944         void *bitmap = bg->bg_bitmap;
945         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
946
947         mlog_entry_void();
948
949         /* All callers get the descriptor via
950          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
951         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
952         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
953
954         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
955              num_bits);
956
957         if (ocfs2_is_cluster_bitmap(alloc_inode))
958                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
959
960         status = ocfs2_journal_access(handle,
961                                       alloc_inode,
962                                       group_bh,
963                                       journal_type);
964         if (status < 0) {
965                 mlog_errno(status);
966                 goto bail;
967         }
968
969         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
970
971         while(num_bits--)
972                 ocfs2_set_bit(bit_off++, bitmap);
973
974         status = ocfs2_journal_dirty(handle,
975                                      group_bh);
976         if (status < 0) {
977                 mlog_errno(status);
978                 goto bail;
979         }
980
981 bail:
982         mlog_exit(status);
983         return status;
984 }
985
986 /* find the one with the most empty bits */
987 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
988 {
989         u16 curr, best;
990
991         BUG_ON(!cl->cl_next_free_rec);
992
993         best = curr = 0;
994         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
995                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
996                     le32_to_cpu(cl->cl_recs[best].c_free))
997                         best = curr;
998                 curr++;
999         }
1000
1001         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1002         return best;
1003 }
1004
/*
 * Move the group descriptor in @bg_bh to the head of chain @chain of
 * the allocator dinode in @fe_bh.  @prev_bg_bh is the descriptor whose
 * bg_next_group is rewritten to skip over @bg_bh; the caller in this
 * file passes the descriptor it read immediately before @bg_bh while
 * walking the chain.
 *
 * Three pointers change, each between its own
 * ocfs2_journal_access()/ocfs2_journal_dirty() pair:
 *
 *   prev_bg->bg_next_group       = bg->bg_next_group  (unlink bg)
 *   bg->bg_next_group            = cl_recs[chain].c_blkno (old head)
 *   cl_recs[chain].c_blkno       = bg->bg_blkno       (bg is new head)
 *
 * Returns 0 on success.  If any journal call fails its negative status
 * is returned and all three fields are restored, in memory, to the
 * values they held on entry.
 */
1005 static int ocfs2_relink_block_group(handle_t *handle,
1006                                     struct inode *alloc_inode,
1007                                     struct buffer_head *fe_bh,
1008                                     struct buffer_head *bg_bh,
1009                                     struct buffer_head *prev_bg_bh,
1010                                     u16 chain)
1011 {
1012         int status;
1013         /* there is a really tiny chance the journal calls could fail,
1014          * but we wouldn't want inconsistent blocks in *any* case. */
1015         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1016         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1017         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1018         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1019
1020         /* The caller got these descriptors from
1021          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1022         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1023         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1024
1025         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1026              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1027              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1028              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1029
             /* Snapshot the three pointers (CPU byte order) so they can
              * be restored at out_rollback. */
1030         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1031         bg_ptr = le64_to_cpu(bg->bg_next_group);
1032         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1033
             /* Step 1: the previous group skips over bg. */
1034         status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
1035                                       OCFS2_JOURNAL_ACCESS_WRITE);
1036         if (status < 0) {
1037                 mlog_errno(status);
1038                 goto out_rollback;
1039         }
1040
1041         prev_bg->bg_next_group = bg->bg_next_group;
1042
1043         status = ocfs2_journal_dirty(handle, prev_bg_bh);
1044         if (status < 0) {
1045                 mlog_errno(status);
1046                 goto out_rollback;
1047         }
1048
             /* Step 2: bg points at the current head of the chain. */
1049         status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
1050                                       OCFS2_JOURNAL_ACCESS_WRITE);
1051         if (status < 0) {
1052                 mlog_errno(status);
1053                 goto out_rollback;
1054         }
1055
1056         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1057
1058         status = ocfs2_journal_dirty(handle, bg_bh);
1059         if (status < 0) {
1060                 mlog_errno(status);
1061                 goto out_rollback;
1062         }
1063
             /* Step 3: the chain record now starts at bg. */
1064         status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
1065                                       OCFS2_JOURNAL_ACCESS_WRITE);
1066         if (status < 0) {
1067                 mlog_errno(status);
1068                 goto out_rollback;
1069         }
1070
1071         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1072
1073         status = ocfs2_journal_dirty(handle, fe_bh);
1074         if (status < 0) {
1075                 mlog_errno(status);
1076                 goto out_rollback;
1077         }
1078
1079         status = 0;
1080 out_rollback:
             /* The success path falls through here too; the restore
              * below only runs when a journal call failed. */
1081         if (status < 0) {
1082                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1083                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1084                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1085         }
1086
1087         mlog_exit(status);
1088         return status;
1089 }
1090
1091 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1092                                                      u32 wanted)
1093 {
1094         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1095 }
1096
1097 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1098  * value on error. */
1099 static int ocfs2_cluster_group_search(struct inode *inode,
1100                                       struct buffer_head *group_bh,
1101                                       u32 bits_wanted, u32 min_bits,
1102                                       u64 max_block,
1103                                       u16 *bit_off, u16 *bits_found)
1104 {
1105         int search = -ENOSPC;
1106         int ret;
1107         u64 blkoff;
1108         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1109         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1110         u16 tmp_off, tmp_found;
1111         unsigned int max_bits, gd_cluster_off;
1112
1113         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1114
1115         if (gd->bg_free_bits_count) {
1116                 max_bits = le16_to_cpu(gd->bg_bits);
1117
1118                 /* Tail groups in cluster bitmaps which aren't cpg
1119                  * aligned are prone to partial extention by a failed
1120                  * fs resize. If the file system resize never got to
1121                  * update the dinode cluster count, then we don't want
1122                  * to trust any clusters past it, regardless of what
1123                  * the group descriptor says. */
1124                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1125                                                           le64_to_cpu(gd->bg_blkno));
1126                 if ((gd_cluster_off + max_bits) >
1127                     OCFS2_I(inode)->ip_clusters) {
1128                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1129                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1130                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1131                              le16_to_cpu(gd->bg_bits),
1132                              OCFS2_I(inode)->ip_clusters, max_bits);
1133                 }
1134
1135                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1136                                                         group_bh, bits_wanted,
1137                                                         max_bits,
1138                                                         &tmp_off, &tmp_found);
1139                 if (ret)
1140                         return ret;
1141
1142                 if (max_block) {
1143                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1144                                                           gd_cluster_off +
1145                                                           tmp_off + tmp_found);
1146                         mlog(0, "Checking %llu against %llu\n",
1147                              (unsigned long long)blkoff,
1148                              (unsigned long long)max_block);
1149                         if (blkoff > max_block)
1150                                 return -ENOSPC;
1151                 }
1152
1153                 /* ocfs2_block_group_find_clear_bits() might
1154                  * return success, but we still want to return
1155                  * -ENOSPC unless it found the minimum number
1156                  * of bits. */
1157                 if (min_bits <= tmp_found) {
1158                         *bit_off = tmp_off;
1159                         *bits_found = tmp_found;
1160                         search = 0; /* success */
1161                 } else if (tmp_found) {
1162                         /*
1163                          * Don't show bits which we'll be returning
1164                          * for allocation to the local alloc bitmap.
1165                          */
1166                         ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1167                 }
1168         }
1169
1170         return search;
1171 }
1172
1173 static int ocfs2_block_group_search(struct inode *inode,
1174                                     struct buffer_head *group_bh,
1175                                     u32 bits_wanted, u32 min_bits,
1176                                     u64 max_block,
1177                                     u16 *bit_off, u16 *bits_found)
1178 {
1179         int ret = -ENOSPC;
1180         u64 blkoff;
1181         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1182
1183         BUG_ON(min_bits != 1);
1184         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1185
1186         if (bg->bg_free_bits_count) {
1187                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1188                                                         group_bh, bits_wanted,
1189                                                         le16_to_cpu(bg->bg_bits),
1190                                                         bit_off, bits_found);
1191                 if (!ret && max_block) {
1192                         blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1193                                 *bits_found;
1194                         mlog(0, "Checking %llu against %llu\n",
1195                              (unsigned long long)blkoff,
1196                              (unsigned long long)max_block);
1197                         if (blkoff > max_block)
1198                                 ret = -ENOSPC;
1199                 }
1200         }
1201
1202         return ret;
1203 }
1204
1205 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1206                                        handle_t *handle,
1207                                        struct buffer_head *di_bh,
1208                                        u32 num_bits,
1209                                        u16 chain)
1210 {
1211         int ret;
1212         u32 tmp_used;
1213         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1214         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1215
1216         ret = ocfs2_journal_access(handle, inode, di_bh,
1217                                    OCFS2_JOURNAL_ACCESS_WRITE);
1218         if (ret < 0) {
1219                 mlog_errno(ret);
1220                 goto out;
1221         }
1222
1223         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1224         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1225         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1226
1227         ret = ocfs2_journal_dirty(handle, di_bh);
1228         if (ret < 0)
1229                 mlog_errno(ret);
1230
1231 out:
1232         return ret;
1233 }
1234
1235 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1236                                   handle_t *handle,
1237                                   u32 bits_wanted,
1238                                   u32 min_bits,
1239                                   u16 *bit_off,
1240                                   unsigned int *num_bits,
1241                                   u64 gd_blkno,
1242                                   u16 *bits_left)
1243 {
1244         int ret;
1245         u16 found;
1246         struct buffer_head *group_bh = NULL;
1247         struct ocfs2_group_desc *gd;
1248         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1249         struct inode *alloc_inode = ac->ac_inode;
1250
1251         ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1252                                           &group_bh);
1253         if (ret < 0) {
1254                 mlog_errno(ret);
1255                 return ret;
1256         }
1257
1258         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1259         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1260                                   ac->ac_max_block, bit_off, &found);
1261         if (ret < 0) {
1262                 if (ret != -ENOSPC)
1263                         mlog_errno(ret);
1264                 goto out;
1265         }
1266
1267         *num_bits = found;
1268
1269         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1270                                                *num_bits,
1271                                                le16_to_cpu(gd->bg_chain));
1272         if (ret < 0) {
1273                 mlog_errno(ret);
1274                 goto out;
1275         }
1276
1277         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1278                                          *bit_off, *num_bits);
1279         if (ret < 0)
1280                 mlog_errno(ret);
1281
1282         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1283
1284 out:
1285         brelse(group_bh);
1286
1287         return ret;
1288 }
1289
1290 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1291                               handle_t *handle,
1292                               u32 bits_wanted,
1293                               u32 min_bits,
1294                               u16 *bit_off,
1295                               unsigned int *num_bits,
1296                               u64 *bg_blkno,
1297                               u16 *bits_left)
1298 {
1299         int status;
1300         u16 chain, tmp_bits;
1301         u32 tmp_used;
1302         u64 next_group;
1303         struct inode *alloc_inode = ac->ac_inode;
1304         struct buffer_head *group_bh = NULL;
1305         struct buffer_head *prev_group_bh = NULL;
1306         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1307         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1308         struct ocfs2_group_desc *bg;
1309
1310         chain = ac->ac_chain;
1311         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1312              bits_wanted, chain,
1313              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1314
1315         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1316                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1317                                              &group_bh);
1318         if (status < 0) {
1319                 mlog_errno(status);
1320                 goto bail;
1321         }
1322         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1323
1324         status = -ENOSPC;
1325         /* for now, the chain search is a bit simplistic. We just use
1326          * the 1st group with any empty bits. */
1327         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1328                                              bits_wanted, min_bits,
1329                                              ac->ac_max_block, bit_off,
1330                                              &tmp_bits)) == -ENOSPC) {
1331                 if (!bg->bg_next_group)
1332                         break;
1333
1334                 brelse(prev_group_bh);
1335                 prev_group_bh = NULL;
1336
1337                 next_group = le64_to_cpu(bg->bg_next_group);
1338                 prev_group_bh = group_bh;
1339                 group_bh = NULL;
1340                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1341                                                      next_group, &group_bh);
1342                 if (status < 0) {
1343                         mlog_errno(status);
1344                         goto bail;
1345                 }
1346                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1347         }
1348         if (status < 0) {
1349                 if (status != -ENOSPC)
1350                         mlog_errno(status);
1351                 goto bail;
1352         }
1353
1354         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1355              tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1356
1357         *num_bits = tmp_bits;
1358
1359         BUG_ON(*num_bits == 0);
1360
1361         /*
1362          * Keep track of previous block descriptor read. When
1363          * we find a target, if we have read more than X
1364          * number of descriptors, and the target is reasonably
1365          * empty, relink him to top of his chain.
1366          *
1367          * We've read 0 extra blocks and only send one more to
1368          * the transaction, yet the next guy to search has a
1369          * much easier time.
1370          *
1371          * Do this *after* figuring out how many bits we're taking out
1372          * of our target group.
1373          */
1374         if (ac->ac_allow_chain_relink &&
1375             (prev_group_bh) &&
1376             (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1377                 status = ocfs2_relink_block_group(handle, alloc_inode,
1378                                                   ac->ac_bh, group_bh,
1379                                                   prev_group_bh, chain);
1380                 if (status < 0) {
1381                         mlog_errno(status);
1382                         goto bail;
1383                 }
1384         }
1385
1386         /* Ok, claim our bits now: set the info on dinode, chainlist
1387          * and then the group */
1388         status = ocfs2_journal_access(handle,
1389                                       alloc_inode,
1390                                       ac->ac_bh,
1391                                       OCFS2_JOURNAL_ACCESS_WRITE);
1392         if (status < 0) {
1393                 mlog_errno(status);
1394                 goto bail;
1395         }
1396
1397         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1398         fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1399         le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1400
1401         status = ocfs2_journal_dirty(handle,
1402                                      ac->ac_bh);
1403         if (status < 0) {
1404                 mlog_errno(status);
1405                 goto bail;
1406         }
1407
1408         status = ocfs2_block_group_set_bits(handle,
1409                                             alloc_inode,
1410                                             bg,
1411                                             group_bh,
1412                                             *bit_off,
1413                                             *num_bits);
1414         if (status < 0) {
1415                 mlog_errno(status);
1416                 goto bail;
1417         }
1418
1419         mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1420              (unsigned long long)le64_to_cpu(fe->i_blkno));
1421
1422         *bg_blkno = le64_to_cpu(bg->bg_blkno);
1423         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1424 bail:
1425         brelse(group_bh);
1426         brelse(prev_group_bh);
1427
1428         mlog_exit(status);
1429         return status;
1430 }
1431
1432 /* will give out up to bits_wanted contiguous bits. */
/*
 * Claim bits from the chain allocator that @ac has reserved
 * (ac->ac_bh holds its dinode).
 *
 * Search order:
 *   1. the group recorded in ac->ac_last_group, if non-zero;
 *   2. the chain picked by ocfs2_find_victim_chain(), with chain
 *      relinking allowed;
 *   3. every other chain whose c_free is non-zero, in index order,
 *      with chain relinking disabled.
 *
 * On success returns 0; *bit_off, *num_bits and *bg_blkno describe the
 * claimed run, and ac->ac_last_group is updated as a hint for the next
 * call (cleared when the group has fewer than @min_bits bits left).
 *
 * Returns -EIO if the dinode reports i_used >= i_total, -ENOSPC if no
 * chain yielded anything, or another negative errno from the search
 * helpers.  ac->ac_bits_given is not updated here.
 */
1433 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1434                                      struct ocfs2_alloc_context *ac,
1435                                      handle_t *handle,
1436                                      u32 bits_wanted,
1437                                      u32 min_bits,
1438                                      u16 *bit_off,
1439                                      unsigned int *num_bits,
1440                                      u64 *bg_blkno)
1441 {
1442         int status;
1443         u16 victim, i;
1444         u16 bits_left = 0;
1445         u64 hint_blkno = ac->ac_last_group;
1446         struct ocfs2_chain_list *cl;
1447         struct ocfs2_dinode *fe;
1448
1449         mlog_entry_void();
1450
1451         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1452         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1453         BUG_ON(!ac->ac_bh);
1454
1455         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1456
1457         /* The bh was validated by the inode read during
1458          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1459         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1460
             /* Inconsistent counts are reported via ocfs2_error() and
              * fail the claim with -EIO. */
1461         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1462             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1463                 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1464                             "bits but only %u total.",
1465                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1466                             le32_to_cpu(fe->id1.bitmap1.i_used),
1467                             le32_to_cpu(fe->id1.bitmap1.i_total));
1468                 status = -EIO;
1469                 goto bail;
1470         }
1471
1472         if (hint_blkno) {
1473                 /* Attempt to short-circuit the usual search mechanism
1474                  * by jumping straight to the most recently used
1475                  * allocation group. This helps us maintain some
1476                  * contiguousness across allocations. */
1477                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1478                                                 min_bits, bit_off, num_bits,
1479                                                 hint_blkno, &bits_left);
1480                 if (!status) {
1481                         /* Be careful to update *bg_blkno here as the
1482                          * caller is expecting it to be filled in, and
1483                          * ocfs2_search_one_group() won't do that for
1484                          * us. */
1485                         *bg_blkno = hint_blkno;
1486                         goto set_hint;
1487                 }
1488                 if (status < 0 && status != -ENOSPC) {
1489                         mlog_errno(status);
1490                         goto bail;
1491                 }
1492         }
1493
1494         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1495
1496         victim = ocfs2_find_victim_chain(cl);
1497         ac->ac_chain = victim;
1498         ac->ac_allow_chain_relink = 1;
1499
1500         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1501                                     num_bits, bg_blkno, &bits_left);
1502         if (!status)
1503                 goto set_hint;
1504         if (status < 0 && status != -ENOSPC) {
1505                 mlog_errno(status);
1506                 goto bail;
1507         }
1508
1509         mlog(0, "Search of victim chain %u came up with nothing, "
1510              "trying all chains now.\n", victim);
1511
1512         /* If we didn't pick a good victim, then just default to
1513          * searching each chain in order. Don't allow chain relinking
1514          * because we only calculate enough journal credits for one
1515          * relink per alloc. */
1516         ac->ac_allow_chain_relink = 0;
1517         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1518                 if (i == victim)
1519                         continue;
1520                 if (!cl->cl_recs[i].c_free)
1521                         continue;
1522
1523                 ac->ac_chain = i;
1524                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1525                                             bit_off, num_bits, bg_blkno,
1526                                             &bits_left);
1527                 if (!status)
1528                         break;
1529                 if (status < 0 && status != -ENOSPC) {
1530                         mlog_errno(status);
1531                         goto bail;
1532                 }
1533         }
1534
             /* Reached with status 0 from any of the three searches, or
              * by falling out of the loop above with status still
              * -ENOSPC, in which case the hint is left alone. */
1535 set_hint:
1536         if (status != -ENOSPC) {
1537                 /* If the next search of this group is not likely to
1538                  * yield a suitable extent, then we reset the last
1539                  * group hint so as to not waste a disk read */
1540                 if (bits_left < min_bits)
1541                         ac->ac_last_group = 0;
1542                 else
1543                         ac->ac_last_group = *bg_blkno;
1544         }
1545
1546 bail:
1547         mlog_exit(status);
1548         return status;
1549 }
1550
1551 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1552                          handle_t *handle,
1553                          struct ocfs2_alloc_context *ac,
1554                          u32 bits_wanted,
1555                          u16 *suballoc_bit_start,
1556                          unsigned int *num_bits,
1557                          u64 *blkno_start)
1558 {
1559         int status;
1560         u64 bg_blkno;
1561
1562         BUG_ON(!ac);
1563         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1564         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1565
1566         status = ocfs2_claim_suballoc_bits(osb,
1567                                            ac,
1568                                            handle,
1569                                            bits_wanted,
1570                                            1,
1571                                            suballoc_bit_start,
1572                                            num_bits,
1573                                            &bg_blkno);
1574         if (status < 0) {
1575                 mlog_errno(status);
1576                 goto bail;
1577         }
1578         atomic_inc(&osb->alloc_stats.bg_allocs);
1579
1580         *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1581         ac->ac_bits_given += (*num_bits);
1582         status = 0;
1583 bail:
1584         mlog_exit(status);
1585         return status;
1586 }
1587
1588 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1589                           handle_t *handle,
1590                           struct ocfs2_alloc_context *ac,
1591                           u16 *suballoc_bit,
1592                           u64 *fe_blkno)
1593 {
1594         int status;
1595         unsigned int num_bits;
1596         u64 bg_blkno;
1597
1598         mlog_entry_void();
1599
1600         BUG_ON(!ac);
1601         BUG_ON(ac->ac_bits_given != 0);
1602         BUG_ON(ac->ac_bits_wanted != 1);
1603         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1604
1605         status = ocfs2_claim_suballoc_bits(osb,
1606                                            ac,
1607                                            handle,
1608                                            1,
1609                                            1,
1610                                            suballoc_bit,
1611                                            &num_bits,
1612                                            &bg_blkno);
1613         if (status < 0) {
1614                 mlog_errno(status);
1615                 goto bail;
1616         }
1617         atomic_inc(&osb->alloc_stats.bg_allocs);
1618
1619         BUG_ON(num_bits != 1);
1620
1621         *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1622         ac->ac_bits_given++;
1623         status = 0;
1624 bail:
1625         mlog_exit(status);
1626         return status;
1627 }
1628
1629 /* translate a group desc. blkno and it's bitmap offset into
1630  * disk cluster offset. */
1631 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1632                                                    u64 bg_blkno,
1633                                                    u16 bg_bit_off)
1634 {
1635         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1636         u32 cluster = 0;
1637
1638         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1639
1640         if (bg_blkno != osb->first_cluster_group_blkno)
1641                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1642         cluster += (u32) bg_bit_off;
1643         return cluster;
1644 }
1645
1646 /* given a cluster offset, calculate which block group it belongs to
1647  * and return that block offset. */
1648 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1649 {
1650         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1651         u32 group_no;
1652
1653         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1654
1655         group_no = cluster / osb->bitmap_cpg;
1656         if (!group_no)
1657                 return osb->first_cluster_group_blkno;
1658         return ocfs2_clusters_to_blocks(inode->i_sb,
1659                                         group_no * osb->bitmap_cpg);
1660 }
1661
1662 /* given the block number of a cluster start, calculate which cluster
1663  * group and descriptor bitmap offset that corresponds to. */
1664 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1665                                                 u64 data_blkno,
1666                                                 u64 *bg_blkno,
1667                                                 u16 *bg_bit_off)
1668 {
1669         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1670         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1671
1672         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1673
1674         *bg_blkno = ocfs2_which_cluster_group(inode,
1675                                               data_cluster);
1676
1677         if (*bg_blkno == osb->first_cluster_group_blkno)
1678                 *bg_bit_off = (u16) data_cluster;
1679         else
1680                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1681                                                              data_blkno - *bg_blkno);
1682 }
1683
1684 /*
1685  * min_bits - minimum contiguous chunk from this total allocation we
1686  * can handle. set to what we asked for originally for a full
1687  * contig. allocation, set to '1' to indicate we can deal with extents
1688  * of any size.
1689  */
1690 int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1691                            handle_t *handle,
1692                            struct ocfs2_alloc_context *ac,
1693                            u32 min_clusters,
1694                            u32 max_clusters,
1695                            u32 *cluster_start,
1696                            u32 *num_clusters)
1697 {
1698         int status;
1699         unsigned int bits_wanted = max_clusters;
1700         u64 bg_blkno = 0;
1701         u16 bg_bit_off;
1702
1703         mlog_entry_void();
1704
1705         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1706
1707         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1708                && ac->ac_which != OCFS2_AC_USE_MAIN);
1709
1710         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1711                 status = ocfs2_claim_local_alloc_bits(osb,
1712                                                       handle,
1713                                                       ac,
1714                                                       bits_wanted,
1715                                                       cluster_start,
1716                                                       num_clusters);
1717                 if (!status)
1718                         atomic_inc(&osb->alloc_stats.local_data);
1719         } else {
1720                 if (min_clusters > (osb->bitmap_cpg - 1)) {
1721                         /* The only paths asking for contiguousness
1722                          * should know about this already. */
1723                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1724                              "group bitmap size %u!\n", min_clusters,
1725                              osb->bitmap_cpg);
1726                         status = -ENOSPC;
1727                         goto bail;
1728                 }
1729                 /* clamp the current request down to a realistic size. */
1730                 if (bits_wanted > (osb->bitmap_cpg - 1))
1731                         bits_wanted = osb->bitmap_cpg - 1;
1732
1733                 status = ocfs2_claim_suballoc_bits(osb,
1734                                                    ac,
1735                                                    handle,
1736                                                    bits_wanted,
1737                                                    min_clusters,
1738                                                    &bg_bit_off,
1739                                                    num_clusters,
1740                                                    &bg_blkno);
1741                 if (!status) {
1742                         *cluster_start =
1743                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1744                                                                  bg_blkno,
1745                                                                  bg_bit_off);
1746                         atomic_inc(&osb->alloc_stats.bitmap_data);
1747                 }
1748         }
1749         if (status < 0) {
1750                 if (status != -ENOSPC)
1751                         mlog_errno(status);
1752                 goto bail;
1753         }
1754
1755         ac->ac_bits_given += *num_clusters;
1756
1757 bail:
1758         mlog_exit(status);
1759         return status;
1760 }
1761
1762 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1763                          handle_t *handle,
1764                          struct ocfs2_alloc_context *ac,
1765                          u32 min_clusters,
1766                          u32 *cluster_start,
1767                          u32 *num_clusters)
1768 {
1769         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1770
1771         return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1772                                       bits_wanted, cluster_start, num_clusters);
1773 }
1774
1775 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1776                                                struct inode *alloc_inode,
1777                                                struct ocfs2_group_desc *bg,
1778                                                struct buffer_head *group_bh,
1779                                                unsigned int bit_off,
1780                                                unsigned int num_bits)
1781 {
1782         int status;
1783         unsigned int tmp;
1784         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1785         struct ocfs2_group_desc *undo_bg = NULL;
1786
1787         mlog_entry_void();
1788
1789         /* The caller got this descriptor from
1790          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1791         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1792
1793         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1794
1795         if (ocfs2_is_cluster_bitmap(alloc_inode))
1796                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1797
1798         status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1799                                       journal_type);
1800         if (status < 0) {
1801                 mlog_errno(status);
1802                 goto bail;
1803         }
1804
1805         if (ocfs2_is_cluster_bitmap(alloc_inode))
1806                 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1807
1808         tmp = num_bits;
1809         while(tmp--) {
1810                 ocfs2_clear_bit((bit_off + tmp),
1811                                 (unsigned long *) bg->bg_bitmap);
1812                 if (ocfs2_is_cluster_bitmap(alloc_inode))
1813                         ocfs2_set_bit(bit_off + tmp,
1814                                       (unsigned long *) undo_bg->bg_bitmap);
1815         }
1816         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1817
1818         status = ocfs2_journal_dirty(handle, group_bh);
1819         if (status < 0)
1820                 mlog_errno(status);
1821 bail:
1822         return status;
1823 }
1824
1825 /*
1826  * expects the suballoc inode to already be locked.
1827  */
1828 int ocfs2_free_suballoc_bits(handle_t *handle,
1829                              struct inode *alloc_inode,
1830                              struct buffer_head *alloc_bh,
1831                              unsigned int start_bit,
1832                              u64 bg_blkno,
1833                              unsigned int count)
1834 {
1835         int status = 0;
1836         u32 tmp_used;
1837         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1838         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1839         struct buffer_head *group_bh = NULL;
1840         struct ocfs2_group_desc *group;
1841
1842         mlog_entry_void();
1843
1844         /* The alloc_bh comes from ocfs2_free_dinode() or
1845          * ocfs2_free_clusters().  The callers have all locked the
1846          * allocator and gotten alloc_bh from the lock call.  This
1847          * validates the dinode buffer.  Any corruption that has happended
1848          * is a code bug. */
1849         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1850         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1851
1852         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1853              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1854              (unsigned long long)bg_blkno, start_bit);
1855
1856         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
1857                                              &group_bh);
1858         if (status < 0) {
1859                 mlog_errno(status);
1860                 goto bail;
1861         }
1862         group = (struct ocfs2_group_desc *) group_bh->b_data;
1863
1864         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1865
1866         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1867                                               group, group_bh,
1868                                               start_bit, count);
1869         if (status < 0) {
1870                 mlog_errno(status);
1871                 goto bail;
1872         }
1873
1874         status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1875                                       OCFS2_JOURNAL_ACCESS_WRITE);
1876         if (status < 0) {
1877                 mlog_errno(status);
1878                 goto bail;
1879         }
1880
1881         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1882                      count);
1883         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1884         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1885
1886         status = ocfs2_journal_dirty(handle, alloc_bh);
1887         if (status < 0) {
1888                 mlog_errno(status);
1889                 goto bail;
1890         }
1891
1892 bail:
1893         brelse(group_bh);
1894
1895         mlog_exit(status);
1896         return status;
1897 }
1898
1899 int ocfs2_free_dinode(handle_t *handle,
1900                       struct inode *inode_alloc_inode,
1901                       struct buffer_head *inode_alloc_bh,
1902                       struct ocfs2_dinode *di)
1903 {
1904         u64 blk = le64_to_cpu(di->i_blkno);
1905         u16 bit = le16_to_cpu(di->i_suballoc_bit);
1906         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1907
1908         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1909                                         inode_alloc_bh, bit, bg_blkno, 1);
1910 }
1911
1912 int ocfs2_free_clusters(handle_t *handle,
1913                        struct inode *bitmap_inode,
1914                        struct buffer_head *bitmap_bh,
1915                        u64 start_blk,
1916                        unsigned int num_clusters)
1917 {
1918         int status;
1919         u16 bg_start_bit;
1920         u64 bg_blkno;
1921         struct ocfs2_dinode *fe;
1922
1923         /* You can't ever have a contiguous set of clusters
1924          * bigger than a block group bitmap so we never have to worry
1925          * about looping on them. */
1926
1927         mlog_entry_void();
1928
1929         /* This is expensive. We can safely remove once this stuff has
1930          * gotten tested really well. */
1931         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1932
1933         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1934
1935         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1936                                      &bg_start_bit);
1937
1938         mlog(0, "want to free %u clusters starting at block %llu\n",
1939              num_clusters, (unsigned long long)start_blk);
1940         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1941              (unsigned long long)bg_blkno, bg_start_bit);
1942
1943         status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1944                                           bg_start_bit, bg_blkno,
1945                                           num_clusters);
1946         if (status < 0) {
1947                 mlog_errno(status);
1948                 goto out;
1949         }
1950
1951         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
1952                                          num_clusters);
1953
1954 out:
1955         mlog_exit(status);
1956         return status;
1957 }
1958
1959 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1960 {
1961         printk("Block Group:\n");
1962         printk("bg_signature:       %s\n", bg->bg_signature);
1963         printk("bg_size:            %u\n", bg->bg_size);
1964         printk("bg_bits:            %u\n", bg->bg_bits);
1965         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1966         printk("bg_chain:           %u\n", bg->bg_chain);
1967         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
1968         printk("bg_next_group:      %llu\n",
1969                (unsigned long long)bg->bg_next_group);
1970         printk("bg_parent_dinode:   %llu\n",
1971                (unsigned long long)bg->bg_parent_dinode);
1972         printk("bg_blkno:           %llu\n",
1973                (unsigned long long)bg->bg_blkno);
1974 }
1975
1976 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1977 {
1978         int i;
1979
1980         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
1981         printk("i_signature:                  %s\n", fe->i_signature);
1982         printk("i_size:                       %llu\n",
1983                (unsigned long long)fe->i_size);
1984         printk("i_clusters:                   %u\n", fe->i_clusters);
1985         printk("i_generation:                 %u\n",
1986                le32_to_cpu(fe->i_generation));
1987         printk("id1.bitmap1.i_used:           %u\n",
1988                le32_to_cpu(fe->id1.bitmap1.i_used));
1989         printk("id1.bitmap1.i_total:          %u\n",
1990                le32_to_cpu(fe->id1.bitmap1.i_total));
1991         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
1992         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
1993         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
1994         printk("id2.i_chain.cl_next_free_rec: %u\n",
1995                fe->id2.i_chain.cl_next_free_rec);
1996         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1997                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
1998                        fe->id2.i_chain.cl_recs[i].c_free);
1999                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2000                        fe->id2.i_chain.cl_recs[i].c_total);
2001                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2002                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2003         }
2004 }
2005
2006 /*
2007  * For a given allocation, determine which allocators will need to be
2008  * accessed, and lock them, reserving the appropriate number of bits.
2009  *
2010  * Sparse file systems call this from ocfs2_write_begin_nolock()
2011  * and ocfs2_allocate_unwritten_extents().
2012  *
2013  * File systems which don't support holes call this from
2014  * ocfs2_extend_allocation().
2015  */
2016 int ocfs2_lock_allocators(struct inode *inode,
2017                           struct ocfs2_extent_tree *et,
2018                           u32 clusters_to_add, u32 extents_to_split,
2019                           struct ocfs2_alloc_context **data_ac,
2020                           struct ocfs2_alloc_context **meta_ac)
2021 {
2022         int ret = 0, num_free_extents;
2023         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2024         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2025
2026         *meta_ac = NULL;
2027         if (data_ac)
2028                 *data_ac = NULL;
2029
2030         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2031
2032         num_free_extents = ocfs2_num_free_extents(osb, inode, et);
2033         if (num_free_extents < 0) {
2034                 ret = num_free_extents;
2035                 mlog_errno(ret);
2036                 goto out;
2037         }
2038
2039         /*
2040          * Sparse allocation file systems need to be more conservative
2041          * with reserving room for expansion - the actual allocation
2042          * happens while we've got a journal handle open so re-taking
2043          * a cluster lock (because we ran out of room for another
2044          * extent) will violate ordering rules.
2045          *
2046          * Most of the time we'll only be seeing this 1 cluster at a time
2047          * anyway.
2048          *
2049          * Always lock for any unwritten extents - we might want to
2050          * add blocks during a split.
2051          */
2052         if (!num_free_extents ||
2053             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2054                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2055                 if (ret < 0) {
2056                         if (ret != -ENOSPC)
2057                                 mlog_errno(ret);
2058                         goto out;
2059                 }
2060         }
2061
2062         if (clusters_to_add == 0)
2063                 goto out;
2064
2065         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2066         if (ret < 0) {
2067                 if (ret != -ENOSPC)
2068                         mlog_errno(ret);
2069                 goto out;
2070         }
2071
2072 out:
2073         if (ret) {
2074                 if (*meta_ac) {
2075                         ocfs2_free_alloc_context(*meta_ac);
2076                         *meta_ac = NULL;
2077                 }
2078
2079                 /*
2080                  * We cannot have an error and a non null *data_ac.
2081                  */
2082         }
2083
2084         return ret;
2085 }