ocfs2: Set suballoc_loc on allocated metadata.
[safe/jmp/linux-2.6] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
/*
 * Values for the "flags" argument threaded through the block group
 * allocation paths (see ocfs2_block_group_alloc()).
 * NOTE(review): the code that tests the individual bits is outside this
 * excerpt - confirm their exact meaning there.
 */
#define NOT_ALLOC_NEW_GROUP		0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/*
 * NOTE(review): presumably a cap used by the steal-from-other-slots
 * logic; its users are not visible in this excerpt - confirm.
 */
#define OCFS2_MAX_TO_STEAL		1024
55
56 struct ocfs2_suballoc_result {
57         u64             sr_bg_blkno;    /* The bg we allocated from.  Set
58                                            to 0 when a block group is
59                                            contiguous. */
60         u64             sr_blkno;       /* The first allocated block */
61         unsigned int    sr_bit_offset;  /* The bit in the bg */
62         unsigned int    sr_bits;        /* How many bits we claimed */
63 };
64
65 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
66 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
67 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
68 static int ocfs2_block_group_fill(handle_t *handle,
69                                   struct inode *alloc_inode,
70                                   struct buffer_head *bg_bh,
71                                   u64 group_blkno,
72                                   unsigned int group_clusters,
73                                   u16 my_chain,
74                                   struct ocfs2_chain_list *cl);
75 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
76                                    struct inode *alloc_inode,
77                                    struct buffer_head *bh,
78                                    u64 max_block,
79                                    u64 *last_alloc_group,
80                                    int flags);
81
82 static int ocfs2_cluster_group_search(struct inode *inode,
83                                       struct buffer_head *group_bh,
84                                       u32 bits_wanted, u32 min_bits,
85                                       u64 max_block,
86                                       struct ocfs2_suballoc_result *res);
87 static int ocfs2_block_group_search(struct inode *inode,
88                                     struct buffer_head *group_bh,
89                                     u32 bits_wanted, u32 min_bits,
90                                     u64 max_block,
91                                     struct ocfs2_suballoc_result *res);
92 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
93                                      handle_t *handle,
94                                      u32 bits_wanted,
95                                      u32 min_bits,
96                                      struct ocfs2_suballoc_result *res);
97 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
98                                          int nr);
99 static inline int ocfs2_block_group_set_bits(handle_t *handle,
100                                              struct inode *alloc_inode,
101                                              struct ocfs2_group_desc *bg,
102                                              struct buffer_head *group_bh,
103                                              unsigned int bit_off,
104                                              unsigned int num_bits);
105 static int ocfs2_relink_block_group(handle_t *handle,
106                                     struct inode *alloc_inode,
107                                     struct buffer_head *fe_bh,
108                                     struct buffer_head *bg_bh,
109                                     struct buffer_head *prev_bg_bh,
110                                     u16 chain);
111 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
112                                                      u32 wanted);
113 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
114                                                    u64 bg_blkno,
115                                                    u16 bg_bit_off);
116 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
117                                                 u64 data_blkno,
118                                                 u64 *bg_blkno,
119                                                 u16 *bg_bit_off);
120 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
121                                              u32 bits_wanted, u64 max_block,
122                                              int flags,
123                                              struct ocfs2_alloc_context **ac);
124
125 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
126 {
127         struct inode *inode = ac->ac_inode;
128
129         if (inode) {
130                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
131                         ocfs2_inode_unlock(inode, 1);
132
133                 mutex_unlock(&inode->i_mutex);
134
135                 iput(inode);
136                 ac->ac_inode = NULL;
137         }
138         brelse(ac->ac_bh);
139         ac->ac_bh = NULL;
140         ac->ac_resv = NULL;
141 }
142
/*
 * Tear down an allocation context completely: drop the resources it
 * holds via ocfs2_free_ac_resource(), then kfree() the context.
 * @ac must not be used after this returns.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	/* Locks, inode reference and buffer head go first ... */
	ocfs2_free_ac_resource(ac);

	/* ... then the memory backing the context. */
	kfree(ac);
}
148
149 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
150 {
151         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
152 }
153
/*
 * Report a group descriptor validation failure.
 *
 * Must be expanded in a scope that has "sb" (struct super_block *) and
 * "resize" (int) visible.  With resize set the message is only logged
 * through mlog(ML_ERROR); otherwise it goes through ocfs2_error().
 */
#define do_error(fmt, ...)						\
	do {								\
		if (!resize)						\
			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
		else							\
			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
	} while (0)
161
162 static int ocfs2_validate_gd_self(struct super_block *sb,
163                                   struct buffer_head *bh,
164                                   int resize)
165 {
166         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
167
168         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
169                 do_error("Group descriptor #%llu has bad signature %.*s",
170                          (unsigned long long)bh->b_blocknr, 7,
171                          gd->bg_signature);
172                 return -EINVAL;
173         }
174
175         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
176                 do_error("Group descriptor #%llu has an invalid bg_blkno "
177                          "of %llu",
178                          (unsigned long long)bh->b_blocknr,
179                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
180                 return -EINVAL;
181         }
182
183         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
184                 do_error("Group descriptor #%llu has an invalid "
185                          "fs_generation of #%u",
186                          (unsigned long long)bh->b_blocknr,
187                          le32_to_cpu(gd->bg_generation));
188                 return -EINVAL;
189         }
190
191         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
192                 do_error("Group descriptor #%llu has bit count %u but "
193                          "claims that %u are free",
194                          (unsigned long long)bh->b_blocknr,
195                          le16_to_cpu(gd->bg_bits),
196                          le16_to_cpu(gd->bg_free_bits_count));
197                 return -EINVAL;
198         }
199
200         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
201                 do_error("Group descriptor #%llu has bit count %u but "
202                          "max bitmap bits of %u",
203                          (unsigned long long)bh->b_blocknr,
204                          le16_to_cpu(gd->bg_bits),
205                          8 * le16_to_cpu(gd->bg_size));
206                 return -EINVAL;
207         }
208
209         return 0;
210 }
211
212 static int ocfs2_validate_gd_parent(struct super_block *sb,
213                                     struct ocfs2_dinode *di,
214                                     struct buffer_head *bh,
215                                     int resize)
216 {
217         unsigned int max_bits;
218         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
219
220         if (di->i_blkno != gd->bg_parent_dinode) {
221                 do_error("Group descriptor #%llu has bad parent "
222                          "pointer (%llu, expected %llu)",
223                          (unsigned long long)bh->b_blocknr,
224                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
225                          (unsigned long long)le64_to_cpu(di->i_blkno));
226                 return -EINVAL;
227         }
228
229         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
230         if (le16_to_cpu(gd->bg_bits) > max_bits) {
231                 do_error("Group descriptor #%llu has bit count of %u",
232                          (unsigned long long)bh->b_blocknr,
233                          le16_to_cpu(gd->bg_bits));
234                 return -EINVAL;
235         }
236
237         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
238         if ((le16_to_cpu(gd->bg_chain) >
239              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
240             ((le16_to_cpu(gd->bg_chain) ==
241              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
242                 do_error("Group descriptor #%llu has bad chain %u",
243                          (unsigned long long)bh->b_blocknr,
244                          le16_to_cpu(gd->bg_chain));
245                 return -EINVAL;
246         }
247
248         return 0;
249 }
250
251 #undef do_error
252
253 /*
254  * This version only prints errors.  It does not fail the filesystem, and
255  * exists only for resize.
256  */
257 int ocfs2_check_group_descriptor(struct super_block *sb,
258                                  struct ocfs2_dinode *di,
259                                  struct buffer_head *bh)
260 {
261         int rc;
262         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
263
264         BUG_ON(!buffer_uptodate(bh));
265
266         /*
267          * If the ecc fails, we return the error but otherwise
268          * leave the filesystem running.  We know any error is
269          * local to this block.
270          */
271         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
272         if (rc) {
273                 mlog(ML_ERROR,
274                      "Checksum failed for group descriptor %llu\n",
275                      (unsigned long long)bh->b_blocknr);
276         } else
277                 rc = ocfs2_validate_gd_self(sb, bh, 1);
278         if (!rc)
279                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
280
281         return rc;
282 }
283
284 static int ocfs2_validate_group_descriptor(struct super_block *sb,
285                                            struct buffer_head *bh)
286 {
287         int rc;
288         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
289
290         mlog(0, "Validating group descriptor %llu\n",
291              (unsigned long long)bh->b_blocknr);
292
293         BUG_ON(!buffer_uptodate(bh));
294
295         /*
296          * If the ecc fails, we return the error but otherwise
297          * leave the filesystem running.  We know any error is
298          * local to this block.
299          */
300         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
301         if (rc)
302                 return rc;
303
304         /*
305          * Errors after here are fatal.
306          */
307
308         return ocfs2_validate_gd_self(sb, bh, 0);
309 }
310
311 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
312                                 u64 gd_blkno, struct buffer_head **bh)
313 {
314         int rc;
315         struct buffer_head *tmp = *bh;
316
317         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
318                               ocfs2_validate_group_descriptor);
319         if (rc)
320                 goto out;
321
322         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
323         if (rc) {
324                 brelse(tmp);
325                 goto out;
326         }
327
328         /* If ocfs2_read_block() got us a new bh, pass it up. */
329         if (!*bh)
330                 *bh = tmp;
331
332 out:
333         return rc;
334 }
335
336 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337                                           struct ocfs2_group_desc *bg,
338                                           struct ocfs2_chain_list *cl,
339                                           u64 p_blkno, u32 clusters)
340 {
341         struct ocfs2_extent_list *el = &bg->bg_list;
342         struct ocfs2_extent_rec *rec;
343
344         BUG_ON(!ocfs2_supports_discontig_bh(osb));
345         if (!el->l_next_free_rec)
346                 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
347         rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
348         rec->e_blkno = p_blkno;
349         rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350                                   le16_to_cpu(cl->cl_bpc));
351         rec->e_leaf_clusters = cpu_to_le32(clusters);
352         le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353         le16_add_cpu(&el->l_next_free_rec, 1);
354 }
355
356 static int ocfs2_block_group_fill(handle_t *handle,
357                                   struct inode *alloc_inode,
358                                   struct buffer_head *bg_bh,
359                                   u64 group_blkno,
360                                   unsigned int group_clusters,
361                                   u16 my_chain,
362                                   struct ocfs2_chain_list *cl)
363 {
364         int status = 0;
365         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
366         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
367         struct super_block * sb = alloc_inode->i_sb;
368
369         mlog_entry_void();
370
371         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
372                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
373                             "b_blocknr (%llu)",
374                             (unsigned long long)group_blkno,
375                             (unsigned long long) bg_bh->b_blocknr);
376                 status = -EIO;
377                 goto bail;
378         }
379
380         status = ocfs2_journal_access_gd(handle,
381                                          INODE_CACHE(alloc_inode),
382                                          bg_bh,
383                                          OCFS2_JOURNAL_ACCESS_CREATE);
384         if (status < 0) {
385                 mlog_errno(status);
386                 goto bail;
387         }
388
389         memset(bg, 0, sb->s_blocksize);
390         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
391         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
392         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1));
393         bg->bg_chain = cpu_to_le16(my_chain);
394         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
395         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
396         bg->bg_blkno = cpu_to_le64(group_blkno);
397         if (group_clusters == le16_to_cpu(cl->cl_cpg))
398                 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
399         else
400                 ocfs2_bg_discontig_add_extent(osb, bg, cl, bg->bg_blkno,
401                                               group_clusters);
402
403         /* set the 1st bit in the bitmap to account for the descriptor block */
404         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
405         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
406
407         ocfs2_journal_dirty(handle, bg_bh);
408
409         /* There is no need to zero out or otherwise initialize the
410          * other blocks in a group - All valid FS metadata in a block
411          * group stores the superblock fs_generation value at
412          * allocation time. */
413
414 bail:
415         mlog_exit(status);
416         return status;
417 }
418
419 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
420 {
421         u16 curr, best;
422
423         best = curr = 0;
424         while (curr < le16_to_cpu(cl->cl_count)) {
425                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
426                     le32_to_cpu(cl->cl_recs[curr].c_total))
427                         best = curr;
428                 curr++;
429         }
430         return best;
431 }
432
433 static struct buffer_head *
434 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
435                                struct inode *alloc_inode,
436                                struct ocfs2_alloc_context *ac,
437                                struct ocfs2_chain_list *cl)
438 {
439         int status;
440         u32 bit_off, num_bits;
441         u64 bg_blkno;
442         struct buffer_head *bg_bh;
443         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
444
445         status = ocfs2_claim_clusters(handle, ac,
446                                       le16_to_cpu(cl->cl_cpg), &bit_off,
447                                       &num_bits);
448         if (status < 0) {
449                 if (status != -ENOSPC)
450                         mlog_errno(status);
451                 goto bail;
452         }
453
454         /* setup the group */
455         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
456         mlog(0, "new descriptor, record %u, at block %llu\n",
457              alloc_rec, (unsigned long long)bg_blkno);
458
459         bg_bh = sb_getblk(osb->sb, bg_blkno);
460         if (!bg_bh) {
461                 status = -EIO;
462                 mlog_errno(status);
463                 goto bail;
464         }
465         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
466
467         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
468                                         bg_blkno, num_bits, alloc_rec, cl);
469         if (status < 0) {
470                 brelse(bg_bh);
471                 mlog_errno(status);
472         }
473
474 bail:
475         return status ? ERR_PTR(status) : bg_bh;
476 }
477
478 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
479                                         handle_t *handle,
480                                         struct ocfs2_alloc_context *ac,
481                                         unsigned int min_bits,
482                                         u32 *bit_off, u32 *num_bits)
483 {
484         int status;
485
486         while (min_bits) {
487                 status = ocfs2_claim_clusters(handle, ac, min_bits,
488                                               bit_off, num_bits);
489                 if (status != -ENOSPC)
490                         break;
491
492                 min_bits >>= 1;
493         }
494
495         return status;
496 }
497
498 static int ocfs2_block_group_grow_discontig(handle_t *handle,
499                                             struct inode *alloc_inode,
500                                             struct buffer_head *bg_bh,
501                                             struct ocfs2_alloc_context *ac,
502                                             struct ocfs2_chain_list *cl,
503                                             unsigned int min_bits)
504 {
505         int status;
506         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
507         struct ocfs2_group_desc *bg =
508                 (struct ocfs2_group_desc *)bg_bh->b_data;
509         unsigned int needed =
510                 ocfs2_bits_per_group(cl) - le16_to_cpu(bg->bg_bits);
511         u32 p_cpos, clusters;
512         u64 p_blkno;
513         struct ocfs2_extent_list *el = &bg->bg_list;
514
515         status = ocfs2_journal_access_gd(handle,
516                                          INODE_CACHE(alloc_inode),
517                                          bg_bh,
518                                          OCFS2_JOURNAL_ACCESS_CREATE);
519         if (status < 0) {
520                 mlog_errno(status);
521                 goto bail;
522         }
523
524         while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
525                                 le16_to_cpu(el->l_count))) {
526                 status = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_ALLOC);
527                 if (status) {
528                         mlog_errno(status);
529                         goto bail;
530                 }
531
532                 if (min_bits > needed)
533                         min_bits = needed;
534                 status = ocfs2_block_group_claim_bits(osb, handle, ac,
535                                                       min_bits, &p_cpos,
536                                                       &clusters);
537                 if (status < 0) {
538                         if (status != -ENOSPC)
539                                 mlog_errno(status);
540                         goto bail;
541                 }
542                 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
543                 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
544                                               clusters);
545
546                 min_bits = clusters;
547                 needed = ocfs2_bits_per_group(cl) - le16_to_cpu(bg->bg_bits);
548         }
549
550         if (needed > 0) {
551         }
552
553         ocfs2_journal_dirty(handle, bg_bh);
554
555 bail:
556         return status;
557 }
558
559 static void ocfs2_bg_alloc_cleanup(struct inode *alloc_inode,
560                                    struct buffer_head *bg_bh,
561                                    struct ocfs2_cached_dealloc_ctxt *dealloc)
562 {
563         int i;
564         struct ocfs2_group_desc *bg;
565         struct ocfs2_extent_list *el;
566         struct ocfs2_extent_rec *rec;
567
568         if (!bg_bh)
569                 return;
570
571         bg = (struct ocfs2_group_desc *)bg_bh->b_data;
572         el = &bg->bg_list;
573         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
574                 rec = &el->l_recs[i];
575                 ocfs2_cache_cluster_dealloc(dealloc,
576                                             le64_to_cpu(rec->e_blkno),
577                                             le32_to_cpu(rec->e_leaf_clusters));
578         }
579
580         ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
581         brelse(bg_bh);
582 }
583
584 static struct buffer_head *
585 ocfs2_block_group_alloc_discontig(handle_t *handle,
586                                   struct inode *alloc_inode,
587                                   struct ocfs2_alloc_context *ac,
588                                   struct ocfs2_chain_list *cl,
589                                   struct ocfs2_cached_dealloc_ctxt *dealloc)
590 {
591         int status;
592         u32 bit_off, num_bits;
593         u64 bg_blkno;
594         unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
595         struct buffer_head *bg_bh = NULL;
596         unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
597         struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
598
599         if (!ocfs2_supports_discontig_bh(osb)) {
600                 status = -ENOSPC;
601                 goto bail;
602         }
603
604         /* Claim the first region */
605         status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
606                                               &bit_off, &num_bits);
607         if (status < 0) {
608                 if (status != -ENOSPC)
609                         mlog_errno(status);
610                 goto bail;
611         }
612         min_bits = num_bits;
613
614         /* setup the group */
615         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
616         mlog(0, "new descriptor, record %u, at block %llu\n",
617              alloc_rec, (unsigned long long)bg_blkno);
618
619         bg_bh = sb_getblk(osb->sb, bg_blkno);
620         if (!bg_bh) {
621                 status = -EIO;
622                 mlog_errno(status);
623                 goto bail;
624         }
625         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
626
627         status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
628                                         bg_blkno, num_bits, alloc_rec, cl);
629         if (status < 0) {
630                 mlog_errno(status);
631                 goto bail;
632         }
633
634         status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
635                                                   bg_bh, ac, cl, min_bits);
636         if (status)
637                 mlog_errno(status);
638
639 bail:
640         if (status)
641                 ocfs2_bg_alloc_cleanup(alloc_inode, bg_bh, dealloc);
642         return status ? ERR_PTR(status) : bg_bh;
643 }
644
645 /*
646  * We expect the block group allocator to already be locked.
647  */
648 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
649                                    struct inode *alloc_inode,
650                                    struct buffer_head *bh,
651                                    u64 max_block,
652                                    u64 *last_alloc_group,
653                                    int flags)
654 {
655         int status, credits;
656         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
657         struct ocfs2_chain_list *cl;
658         struct ocfs2_alloc_context *ac = NULL;
659         handle_t *handle = NULL;
660         u64 bg_blkno;
661         struct buffer_head *bg_bh = NULL;
662         struct ocfs2_group_desc *bg;
663         struct ocfs2_cached_dealloc_ctxt dealloc;
664
665         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
666
667         mlog_entry_void();
668
669         ocfs2_init_dealloc_ctxt(&dealloc);
670
671         cl = &fe->id2.i_chain;
672         status = ocfs2_reserve_clusters_with_limit(osb,
673                                                    le16_to_cpu(cl->cl_cpg),
674                                                    max_block, flags, &ac);
675         if (status < 0) {
676                 if (status != -ENOSPC)
677                         mlog_errno(status);
678                 goto bail;
679         }
680
681         credits = ocfs2_calc_group_alloc_credits(osb->sb,
682                                                  le16_to_cpu(cl->cl_cpg));
683         handle = ocfs2_start_trans(osb, credits);
684         if (IS_ERR(handle)) {
685                 status = PTR_ERR(handle);
686                 handle = NULL;
687                 mlog_errno(status);
688                 goto bail;
689         }
690
691         if (last_alloc_group && *last_alloc_group != 0) {
692                 mlog(0, "use old allocation group %llu for block group alloc\n",
693                      (unsigned long long)*last_alloc_group);
694                 ac->ac_last_group = *last_alloc_group;
695         }
696
697         bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
698                                                ac, cl);
699         if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
700                 bg_bh = ocfs2_block_group_alloc_discontig(handle,
701                                                           alloc_inode,
702                                                           ac, cl,
703                                                           &dealloc);
704         if (IS_ERR(bg_bh)) {
705                 status = PTR_ERR(bg_bh);
706                 bg_bh = NULL;
707                 if (status != -ENOSPC)
708                         mlog_errno(status);
709                 goto bail;
710         }
711         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
712
713         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
714                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
715         if (status < 0) {
716                 mlog_errno(status);
717                 goto bail;
718         }
719
720         le32_add_cpu(&cl->cl_recs[bg->bg_chain].c_free,
721                      le16_to_cpu(bg->bg_free_bits_count));
722         le32_add_cpu(&cl->cl_recs[bg->bg_chain].c_total,
723                      le16_to_cpu(bg->bg_bits));
724         cl->cl_recs[bg->bg_chain].c_blkno  = cpu_to_le64(bg_blkno);
725         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
726                 le16_add_cpu(&cl->cl_next_free_rec, 1);
727
728         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
729                                         le16_to_cpu(bg->bg_free_bits_count));
730         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
731         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
732
733         ocfs2_journal_dirty(handle, bh);
734
735         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
736         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
737         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
738                                              le32_to_cpu(fe->i_clusters)));
739         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
740         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
741         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
742
743         status = 0;
744
745         /* save the new last alloc group so that the caller can cache it. */
746         if (last_alloc_group)
747                 *last_alloc_group = ac->ac_last_group;
748
749 bail:
750         if (handle)
751                 ocfs2_commit_trans(osb, handle);
752
753         if (ocfs2_dealloc_has_cluster(&dealloc)) {
754                 ocfs2_schedule_truncate_log_flush(osb, 1);
755                 ocfs2_run_deallocs(osb, &dealloc);
756         }
757
758         if (ac)
759                 ocfs2_free_alloc_context(ac);
760
761         brelse(bg_bh);
762
763         mlog_exit(status);
764         return status;
765 }
766
767 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
768                                        struct ocfs2_alloc_context *ac,
769                                        int type,
770                                        u32 slot,
771                                        u64 *last_alloc_group,
772                                        int flags)
773 {
774         int status;
775         u32 bits_wanted = ac->ac_bits_wanted;
776         struct inode *alloc_inode;
777         struct buffer_head *bh = NULL;
778         struct ocfs2_dinode *fe;
779         u32 free_bits;
780
781         mlog_entry_void();
782
783         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
784         if (!alloc_inode) {
785                 mlog_errno(-EINVAL);
786                 return -EINVAL;
787         }
788
789         mutex_lock(&alloc_inode->i_mutex);
790
791         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
792         if (status < 0) {
793                 mutex_unlock(&alloc_inode->i_mutex);
794                 iput(alloc_inode);
795
796                 mlog_errno(status);
797                 return status;
798         }
799
800         ac->ac_inode = alloc_inode;
801         ac->ac_alloc_slot = slot;
802
803         fe = (struct ocfs2_dinode *) bh->b_data;
804
805         /* The bh was validated by the inode read inside
806          * ocfs2_inode_lock().  Any corruption is a code bug. */
807         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
808
809         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
810                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
811                             (unsigned long long)le64_to_cpu(fe->i_blkno));
812                 status = -EIO;
813                 goto bail;
814         }
815
816         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
817                 le32_to_cpu(fe->id1.bitmap1.i_used);
818
819         if (bits_wanted > free_bits) {
820                 /* cluster bitmap never grows */
821                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
822                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
823                              bits_wanted, free_bits);
824                         status = -ENOSPC;
825                         goto bail;
826                 }
827
828                 if (!(flags & ALLOC_NEW_GROUP)) {
829                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
830                              "and we don't alloc a new group for it.\n",
831                              slot, bits_wanted, free_bits);
832                         status = -ENOSPC;
833                         goto bail;
834                 }
835
836                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
837                                                  ac->ac_max_block,
838                                                  last_alloc_group, flags);
839                 if (status < 0) {
840                         if (status != -ENOSPC)
841                                 mlog_errno(status);
842                         goto bail;
843                 }
844                 atomic_inc(&osb->alloc_stats.bg_extends);
845
846                 /* You should never ask for this much metadata */
847                 BUG_ON(bits_wanted >
848                        (le32_to_cpu(fe->id1.bitmap1.i_total)
849                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
850         }
851
852         get_bh(bh);
853         ac->ac_bh = bh;
854 bail:
855         brelse(bh);
856
857         mlog_exit(status);
858         return status;
859 }
860
861 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
862 {
863         spin_lock(&osb->osb_lock);
864         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
865         spin_unlock(&osb->osb_lock);
866         atomic_set(&osb->s_num_inodes_stolen, 0);
867 }
868
869 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
870 {
871         spin_lock(&osb->osb_lock);
872         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
873         spin_unlock(&osb->osb_lock);
874         atomic_set(&osb->s_num_meta_stolen, 0);
875 }
876
/* Reset the steal state for both inode and metadata allocation. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
        ocfs2_init_inode_steal_slot(osb);
        ocfs2_init_meta_steal_slot(osb);
}
882
883 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
884 {
885         spin_lock(&osb->osb_lock);
886         if (type == INODE_ALLOC_SYSTEM_INODE)
887                 osb->s_inode_steal_slot = slot;
888         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
889                 osb->s_meta_steal_slot = slot;
890         spin_unlock(&osb->osb_lock);
891 }
892
893 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
894 {
895         int slot = OCFS2_INVALID_SLOT;
896
897         spin_lock(&osb->osb_lock);
898         if (type == INODE_ALLOC_SYSTEM_INODE)
899                 slot = osb->s_inode_steal_slot;
900         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
901                 slot = osb->s_meta_steal_slot;
902         spin_unlock(&osb->osb_lock);
903
904         return slot;
905 }
906
907 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
908 {
909         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
910 }
911
912 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
913 {
914         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
915 }
916
917 static int ocfs2_steal_resource(struct ocfs2_super *osb,
918                                 struct ocfs2_alloc_context *ac,
919                                 int type)
920 {
921         int i, status = -ENOSPC;
922         int slot = __ocfs2_get_steal_slot(osb, type);
923
924         /* Start to steal resource from the first slot after ours. */
925         if (slot == OCFS2_INVALID_SLOT)
926                 slot = osb->slot_num + 1;
927
928         for (i = 0; i < osb->max_slots; i++, slot++) {
929                 if (slot == osb->max_slots)
930                         slot = 0;
931
932                 if (slot == osb->slot_num)
933                         continue;
934
935                 status = ocfs2_reserve_suballoc_bits(osb, ac,
936                                                      type,
937                                                      (u32)slot, NULL,
938                                                      NOT_ALLOC_NEW_GROUP);
939                 if (status >= 0) {
940                         __ocfs2_set_steal_slot(osb, slot, type);
941                         break;
942                 }
943
944                 ocfs2_free_ac_resource(ac);
945         }
946
947         return status;
948 }
949
950 static int ocfs2_steal_inode(struct ocfs2_super *osb,
951                              struct ocfs2_alloc_context *ac)
952 {
953         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
954 }
955
956 static int ocfs2_steal_meta(struct ocfs2_super *osb,
957                             struct ocfs2_alloc_context *ac)
958 {
959         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
960 }
961
962 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
963                                       int blocks,
964                                       struct ocfs2_alloc_context **ac)
965 {
966         int status;
967         int slot = ocfs2_get_meta_steal_slot(osb);
968
969         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
970         if (!(*ac)) {
971                 status = -ENOMEM;
972                 mlog_errno(status);
973                 goto bail;
974         }
975
976         (*ac)->ac_bits_wanted = blocks;
977         (*ac)->ac_which = OCFS2_AC_USE_META;
978         (*ac)->ac_group_search = ocfs2_block_group_search;
979
980         if (slot != OCFS2_INVALID_SLOT &&
981                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
982                 goto extent_steal;
983
984         atomic_set(&osb->s_num_meta_stolen, 0);
985         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
986                                              EXTENT_ALLOC_SYSTEM_INODE,
987                                              (u32)osb->slot_num, NULL,
988                                              ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
989
990
991         if (status >= 0) {
992                 status = 0;
993                 if (slot != OCFS2_INVALID_SLOT)
994                         ocfs2_init_meta_steal_slot(osb);
995                 goto bail;
996         } else if (status < 0 && status != -ENOSPC) {
997                 mlog_errno(status);
998                 goto bail;
999         }
1000
1001         ocfs2_free_ac_resource(*ac);
1002
1003 extent_steal:
1004         status = ocfs2_steal_meta(osb, *ac);
1005         atomic_inc(&osb->s_num_meta_stolen);
1006         if (status < 0) {
1007                 if (status != -ENOSPC)
1008                         mlog_errno(status);
1009                 goto bail;
1010         }
1011
1012         status = 0;
1013 bail:
1014         if ((status < 0) && *ac) {
1015                 ocfs2_free_alloc_context(*ac);
1016                 *ac = NULL;
1017         }
1018
1019         mlog_exit(status);
1020         return status;
1021 }
1022
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed() reports
 * for the extent list @root_el.  See
 * ocfs2_reserve_new_metadata_blocks() for the return convention.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
                               struct ocfs2_extent_list *root_el,
                               struct ocfs2_alloc_context **ac)
{
        int blocks = ocfs2_extend_meta_needed(root_el);

        return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
1031
1032 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1033                             struct ocfs2_alloc_context **ac)
1034 {
1035         int status;
1036         int slot = ocfs2_get_inode_steal_slot(osb);
1037         u64 alloc_group;
1038
1039         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1040         if (!(*ac)) {
1041                 status = -ENOMEM;
1042                 mlog_errno(status);
1043                 goto bail;
1044         }
1045
1046         (*ac)->ac_bits_wanted = 1;
1047         (*ac)->ac_which = OCFS2_AC_USE_INODE;
1048
1049         (*ac)->ac_group_search = ocfs2_block_group_search;
1050
1051         /*
1052          * stat(2) can't handle i_ino > 32bits, so we tell the
1053          * lower levels not to allocate us a block group past that
1054          * limit.  The 'inode64' mount option avoids this behavior.
1055          */
1056         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1057                 (*ac)->ac_max_block = (u32)~0U;
1058
1059         /*
1060          * slot is set when we successfully steal inode from other nodes.
1061          * It is reset in 3 places:
1062          * 1. when we flush the truncate log
1063          * 2. when we complete local alloc recovery.
1064          * 3. when we successfully allocate from our own slot.
1065          * After it is set, we will go on stealing inodes until we find the
1066          * need to check our slots to see whether there is some space for us.
1067          */
1068         if (slot != OCFS2_INVALID_SLOT &&
1069             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1070                 goto inode_steal;
1071
1072         atomic_set(&osb->s_num_inodes_stolen, 0);
1073         alloc_group = osb->osb_inode_alloc_group;
1074         status = ocfs2_reserve_suballoc_bits(osb, *ac,
1075                                              INODE_ALLOC_SYSTEM_INODE,
1076                                              (u32)osb->slot_num,
1077                                              &alloc_group,
1078                                              ALLOC_NEW_GROUP |
1079                                              ALLOC_GROUPS_FROM_GLOBAL);
1080         if (status >= 0) {
1081                 status = 0;
1082
1083                 spin_lock(&osb->osb_lock);
1084                 osb->osb_inode_alloc_group = alloc_group;
1085                 spin_unlock(&osb->osb_lock);
1086                 mlog(0, "after reservation, new allocation group is "
1087                      "%llu\n", (unsigned long long)alloc_group);
1088
1089                 /*
1090                  * Some inodes must be freed by us, so try to allocate
1091                  * from our own next time.
1092                  */
1093                 if (slot != OCFS2_INVALID_SLOT)
1094                         ocfs2_init_inode_steal_slot(osb);
1095                 goto bail;
1096         } else if (status < 0 && status != -ENOSPC) {
1097                 mlog_errno(status);
1098                 goto bail;
1099         }
1100
1101         ocfs2_free_ac_resource(*ac);
1102
1103 inode_steal:
1104         status = ocfs2_steal_inode(osb, *ac);
1105         atomic_inc(&osb->s_num_inodes_stolen);
1106         if (status < 0) {
1107                 if (status != -ENOSPC)
1108                         mlog_errno(status);
1109                 goto bail;
1110         }
1111
1112         status = 0;
1113 bail:
1114         if ((status < 0) && *ac) {
1115                 ocfs2_free_alloc_context(*ac);
1116                 *ac = NULL;
1117         }
1118
1119         mlog_exit(status);
1120         return status;
1121 }
1122
1123 /* local alloc code has to do the same thing, so rather than do this
1124  * twice.. */
1125 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1126                                       struct ocfs2_alloc_context *ac)
1127 {
1128         int status;
1129
1130         ac->ac_which = OCFS2_AC_USE_MAIN;
1131         ac->ac_group_search = ocfs2_cluster_group_search;
1132
1133         status = ocfs2_reserve_suballoc_bits(osb, ac,
1134                                              GLOBAL_BITMAP_SYSTEM_INODE,
1135                                              OCFS2_INVALID_SLOT, NULL,
1136                                              ALLOC_NEW_GROUP);
1137         if (status < 0 && status != -ENOSPC) {
1138                 mlog_errno(status);
1139                 goto bail;
1140         }
1141
1142 bail:
1143         return status;
1144 }
1145
1146 /* Callers don't need to care which bitmap (local alloc or main) to
1147  * use so we figure it out for them, but unfortunately this clutters
1148  * things a bit. */
1149 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1150                                              u32 bits_wanted, u64 max_block,
1151                                              int flags,
1152                                              struct ocfs2_alloc_context **ac)
1153 {
1154         int status;
1155
1156         mlog_entry_void();
1157
1158         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1159         if (!(*ac)) {
1160                 status = -ENOMEM;
1161                 mlog_errno(status);
1162                 goto bail;
1163         }
1164
1165         (*ac)->ac_bits_wanted = bits_wanted;
1166         (*ac)->ac_max_block = max_block;
1167
1168         status = -ENOSPC;
1169         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1170             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1171                 status = ocfs2_reserve_local_alloc_bits(osb,
1172                                                         bits_wanted,
1173                                                         *ac);
1174                 if ((status < 0) && (status != -ENOSPC)) {
1175                         mlog_errno(status);
1176                         goto bail;
1177                 }
1178         }
1179
1180         if (status == -ENOSPC) {
1181                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1182                 if (status < 0) {
1183                         if (status != -ENOSPC)
1184                                 mlog_errno(status);
1185                         goto bail;
1186                 }
1187         }
1188
1189         status = 0;
1190 bail:
1191         if ((status < 0) && *ac) {
1192                 ocfs2_free_alloc_context(*ac);
1193                 *ac = NULL;
1194         }
1195
1196         mlog_exit(status);
1197         return status;
1198 }
1199
1200 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1201                            u32 bits_wanted,
1202                            struct ocfs2_alloc_context **ac)
1203 {
1204         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1205                                                  ALLOC_NEW_GROUP, ac);
1206 }
1207
1208 /*
1209  * More or less lifted from ext3. I'll leave their description below:
1210  *
1211  * "For ext3 allocations, we must not reuse any blocks which are
1212  * allocated in the bitmap buffer's "last committed data" copy.  This
1213  * prevents deletes from freeing up the page for reuse until we have
1214  * committed the delete transaction.
1215  *
1216  * If we didn't do this, then deleting something and reallocating it as
1217  * data would allow the old block to be overwritten before the
1218  * transaction committed (because we force data to disk before commit).
1219  * This would lead to corruption if we crashed between overwriting the
1220  * data and committing the delete.
1221  *
1222  * @@@ We may want to make this allocation behaviour conditional on
1223  * data-writes at some point, and disable it for metadata allocations or
1224  * sync-data inodes."
1225  *
1226  * Note: OCFS2 already does this differently for metadata vs data
1227  * allocations, as those bitmaps are separate and undo access is never
1228  * called on a metadata group descriptor.
1229  */
1230 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1231                                          int nr)
1232 {
1233         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1234         int ret;
1235
1236         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1237                 return 0;
1238
1239         if (!buffer_jbd(bg_bh))
1240                 return 1;
1241
1242         jbd_lock_bh_state(bg_bh);
1243         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1244         if (bg)
1245                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1246         else
1247                 ret = 1;
1248         jbd_unlock_bh_state(bg_bh);
1249
1250         return ret;
1251 }
1252
1253 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1254                                              struct buffer_head *bg_bh,
1255                                              unsigned int bits_wanted,
1256                                              unsigned int total_bits,
1257                                              struct ocfs2_suballoc_result *res)
1258 {
1259         void *bitmap;
1260         u16 best_offset, best_size;
1261         int offset, start, found, status = 0;
1262         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1263
1264         /* Callers got this descriptor from
1265          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1266         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1267
1268         found = start = best_offset = best_size = 0;
1269         bitmap = bg->bg_bitmap;
1270
1271         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1272                 if (offset == total_bits)
1273                         break;
1274
1275                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1276                         /* We found a zero, but we can't use it as it
1277                          * hasn't been put to disk yet! */
1278                         found = 0;
1279                         start = offset + 1;
1280                 } else if (offset == start) {
1281                         /* we found a zero */
1282                         found++;
1283                         /* move start to the next bit to test */
1284                         start++;
1285                 } else {
1286                         /* got a zero after some ones */
1287                         found = 1;
1288                         start = offset + 1;
1289                 }
1290                 if (found > best_size) {
1291                         best_size = found;
1292                         best_offset = start - found;
1293                 }
1294                 /* we got everything we needed */
1295                 if (found == bits_wanted) {
1296                         /* mlog(0, "Found it all!\n"); */
1297                         break;
1298                 }
1299         }
1300
1301         if (best_size) {
1302                 res->sr_bit_offset = best_offset;
1303                 res->sr_bits = best_size;
1304         } else {
1305                 status = -ENOSPC;
1306                 /* No error log here -- see the comment above
1307                  * ocfs2_test_bg_bit_allocatable */
1308         }
1309
1310         return status;
1311 }
1312
1313 static inline int ocfs2_block_group_set_bits(handle_t *handle,
1314                                              struct inode *alloc_inode,
1315                                              struct ocfs2_group_desc *bg,
1316                                              struct buffer_head *group_bh,
1317                                              unsigned int bit_off,
1318                                              unsigned int num_bits)
1319 {
1320         int status;
1321         void *bitmap = bg->bg_bitmap;
1322         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1323
1324         mlog_entry_void();
1325
1326         /* All callers get the descriptor via
1327          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1328         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1329         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1330
1331         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1332              num_bits);
1333
1334         if (ocfs2_is_cluster_bitmap(alloc_inode))
1335                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1336
1337         status = ocfs2_journal_access_gd(handle,
1338                                          INODE_CACHE(alloc_inode),
1339                                          group_bh,
1340                                          journal_type);
1341         if (status < 0) {
1342                 mlog_errno(status);
1343                 goto bail;
1344         }
1345
1346         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1347         while(num_bits--)
1348                 ocfs2_set_bit(bit_off++, bitmap);
1349
1350         ocfs2_journal_dirty(handle, group_bh);
1351
1352 bail:
1353         mlog_exit(status);
1354         return status;
1355 }
1356
1357 /* find the one with the most empty bits */
1358 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1359 {
1360         u16 curr, best;
1361
1362         BUG_ON(!cl->cl_next_free_rec);
1363
1364         best = curr = 0;
1365         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1366                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1367                     le32_to_cpu(cl->cl_recs[best].c_free))
1368                         best = curr;
1369                 curr++;
1370         }
1371
1372         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1373         return best;
1374 }
1375
1376 static int ocfs2_relink_block_group(handle_t *handle,
1377                                     struct inode *alloc_inode,
1378                                     struct buffer_head *fe_bh,
1379                                     struct buffer_head *bg_bh,
1380                                     struct buffer_head *prev_bg_bh,
1381                                     u16 chain)
1382 {
1383         int status;
1384         /* there is a really tiny chance the journal calls could fail,
1385          * but we wouldn't want inconsistent blocks in *any* case. */
1386         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1387         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1388         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1389         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1390
1391         /* The caller got these descriptors from
1392          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1393         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1394         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1395
1396         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1397              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1398              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1399              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1400
1401         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1402         bg_ptr = le64_to_cpu(bg->bg_next_group);
1403         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1404
1405         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1406                                          prev_bg_bh,
1407                                          OCFS2_JOURNAL_ACCESS_WRITE);
1408         if (status < 0) {
1409                 mlog_errno(status);
1410                 goto out_rollback;
1411         }
1412
1413         prev_bg->bg_next_group = bg->bg_next_group;
1414         ocfs2_journal_dirty(handle, prev_bg_bh);
1415
1416         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1417                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1418         if (status < 0) {
1419                 mlog_errno(status);
1420                 goto out_rollback;
1421         }
1422
1423         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1424         ocfs2_journal_dirty(handle, bg_bh);
1425
1426         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1427                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1428         if (status < 0) {
1429                 mlog_errno(status);
1430                 goto out_rollback;
1431         }
1432
1433         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1434         ocfs2_journal_dirty(handle, fe_bh);
1435
1436 out_rollback:
1437         if (status < 0) {
1438                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1439                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1440                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1441         }
1442
1443         mlog_exit(status);
1444         return status;
1445 }
1446
1447 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1448                                                      u32 wanted)
1449 {
1450         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1451 }
1452
1453 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1454  * value on error. */
1455 static int ocfs2_cluster_group_search(struct inode *inode,
1456                                       struct buffer_head *group_bh,
1457                                       u32 bits_wanted, u32 min_bits,
1458                                       u64 max_block,
1459                                       struct ocfs2_suballoc_result *res)
1460 {
1461         int search = -ENOSPC;
1462         int ret;
1463         u64 blkoff;
1464         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1465         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1466         unsigned int max_bits, gd_cluster_off;
1467
1468         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1469
1470         if (gd->bg_free_bits_count) {
1471                 max_bits = le16_to_cpu(gd->bg_bits);
1472
1473                 /* Tail groups in cluster bitmaps which aren't cpg
1474                  * aligned are prone to partial extention by a failed
1475                  * fs resize. If the file system resize never got to
1476                  * update the dinode cluster count, then we don't want
1477                  * to trust any clusters past it, regardless of what
1478                  * the group descriptor says. */
1479                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1480                                                           le64_to_cpu(gd->bg_blkno));
1481                 if ((gd_cluster_off + max_bits) >
1482                     OCFS2_I(inode)->ip_clusters) {
1483                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1484                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1485                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1486                              le16_to_cpu(gd->bg_bits),
1487                              OCFS2_I(inode)->ip_clusters, max_bits);
1488                 }
1489
1490                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1491                                                         group_bh, bits_wanted,
1492                                                         max_bits, res);
1493                 if (ret)
1494                         return ret;
1495
1496                 if (max_block) {
1497                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1498                                                           gd_cluster_off +
1499                                                           res->sr_bit_offset +
1500                                                           res->sr_bits);
1501                         mlog(0, "Checking %llu against %llu\n",
1502                              (unsigned long long)blkoff,
1503                              (unsigned long long)max_block);
1504                         if (blkoff > max_block)
1505                                 return -ENOSPC;
1506                 }
1507
1508                 /* ocfs2_block_group_find_clear_bits() might
1509                  * return success, but we still want to return
1510                  * -ENOSPC unless it found the minimum number
1511                  * of bits. */
1512                 if (min_bits <= res->sr_bits)
1513                         search = 0; /* success */
1514                 else if (res->sr_bits) {
1515                         /*
1516                          * Don't show bits which we'll be returning
1517                          * for allocation to the local alloc bitmap.
1518                          */
1519                         ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1520                 }
1521         }
1522
1523         return search;
1524 }
1525
1526 static int ocfs2_block_group_search(struct inode *inode,
1527                                     struct buffer_head *group_bh,
1528                                     u32 bits_wanted, u32 min_bits,
1529                                     u64 max_block,
1530                                     struct ocfs2_suballoc_result *res)
1531 {
1532         int ret = -ENOSPC;
1533         u64 blkoff;
1534         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1535
1536         BUG_ON(min_bits != 1);
1537         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1538
1539         if (bg->bg_free_bits_count) {
1540                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1541                                                         group_bh, bits_wanted,
1542                                                         le16_to_cpu(bg->bg_bits),
1543                                                         res);
1544                 if (!ret && max_block) {
1545                         blkoff = le64_to_cpu(bg->bg_blkno) +
1546                                 res->sr_bit_offset + res->sr_bits;
1547                         mlog(0, "Checking %llu against %llu\n",
1548                              (unsigned long long)blkoff,
1549                              (unsigned long long)max_block);
1550                         if (blkoff > max_block)
1551                                 ret = -ENOSPC;
1552                 }
1553         }
1554
1555         return ret;
1556 }
1557
1558 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1559                                        handle_t *handle,
1560                                        struct buffer_head *di_bh,
1561                                        u32 num_bits,
1562                                        u16 chain)
1563 {
1564         int ret;
1565         u32 tmp_used;
1566         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1567         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1568
1569         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1570                                       OCFS2_JOURNAL_ACCESS_WRITE);
1571         if (ret < 0) {
1572                 mlog_errno(ret);
1573                 goto out;
1574         }
1575
1576         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1577         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1578         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1579         ocfs2_journal_dirty(handle, di_bh);
1580
1581 out:
1582         return ret;
1583 }
1584
1585 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1586                                          struct ocfs2_extent_rec *rec,
1587                                          struct ocfs2_chain_list *cl)
1588 {
1589         unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1590         unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1591         unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1592
1593         if (res->sr_bit_offset < bitoff)
1594                 return 0;
1595         if (res->sr_bit_offset >= (bitoff + bitcount))
1596                 return 0;
1597         res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1598                 (res->sr_bit_offset - bitoff);
1599         if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1600                 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1601         return 1;
1602 }
1603
1604 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1605                                           struct ocfs2_group_desc *bg,
1606                                           struct ocfs2_suballoc_result *res)
1607 {
1608         int i;
1609         u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1610         struct ocfs2_extent_rec *rec;
1611         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1612         struct ocfs2_chain_list *cl = &di->id2.i_chain;
1613
1614         if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1615                 res->sr_blkno = 0;
1616                 return;
1617         }
1618
1619         res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1620         res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1621         if (!ocfs2_supports_discontig_bh(OCFS2_SB(ac->ac_inode->i_sb)) ||
1622             !bg->bg_list.l_next_free_rec)
1623                 return;
1624
1625         for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1626                 rec = &bg->bg_list.l_recs[i];
1627                 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1628                         res->sr_bg_blkno = bg_blkno;  /* Restore */
1629                         break;
1630                 }
1631         }
1632 }
1633
1634 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1635                                   handle_t *handle,
1636                                   u32 bits_wanted,
1637                                   u32 min_bits,
1638                                   struct ocfs2_suballoc_result *res,
1639                                   u16 *bits_left)
1640 {
1641         int ret;
1642         struct buffer_head *group_bh = NULL;
1643         struct ocfs2_group_desc *gd;
1644         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1645         struct inode *alloc_inode = ac->ac_inode;
1646
1647         ret = ocfs2_read_group_descriptor(alloc_inode, di,
1648                                           res->sr_bg_blkno, &group_bh);
1649         if (ret < 0) {
1650                 mlog_errno(ret);
1651                 return ret;
1652         }
1653
1654         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1655         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1656                                   ac->ac_max_block, res);
1657         if (ret < 0) {
1658                 if (ret != -ENOSPC)
1659                         mlog_errno(ret);
1660                 goto out;
1661         }
1662
1663         if (!ret)
1664                 ocfs2_bg_discontig_fix_result(ac, gd, res);
1665
1666         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1667                                                res->sr_bits,
1668                                                le16_to_cpu(gd->bg_chain));
1669         if (ret < 0) {
1670                 mlog_errno(ret);
1671                 goto out;
1672         }
1673
1674         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1675                                          res->sr_bit_offset, res->sr_bits);
1676         if (ret < 0)
1677                 mlog_errno(ret);
1678
1679         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1680
1681 out:
1682         brelse(group_bh);
1683
1684         return ret;
1685 }
1686
1687 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1688                               handle_t *handle,
1689                               u32 bits_wanted,
1690                               u32 min_bits,
1691                               struct ocfs2_suballoc_result *res,
1692                               u16 *bits_left)
1693 {
1694         int status;
1695         u16 chain;
1696         u32 tmp_used;
1697         u64 next_group;
1698         struct inode *alloc_inode = ac->ac_inode;
1699         struct buffer_head *group_bh = NULL;
1700         struct buffer_head *prev_group_bh = NULL;
1701         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1702         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1703         struct ocfs2_group_desc *bg;
1704
1705         chain = ac->ac_chain;
1706         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1707              bits_wanted, chain,
1708              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1709
1710         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1711                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1712                                              &group_bh);
1713         if (status < 0) {
1714                 mlog_errno(status);
1715                 goto bail;
1716         }
1717         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1718
1719         status = -ENOSPC;
1720         /* for now, the chain search is a bit simplistic. We just use
1721          * the 1st group with any empty bits. */
1722         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1723                                              bits_wanted, min_bits,
1724                                              ac->ac_max_block,
1725                                              res)) == -ENOSPC) {
1726                 if (!bg->bg_next_group)
1727                         break;
1728
1729                 brelse(prev_group_bh);
1730                 prev_group_bh = NULL;
1731
1732                 next_group = le64_to_cpu(bg->bg_next_group);
1733                 prev_group_bh = group_bh;
1734                 group_bh = NULL;
1735                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1736                                                      next_group, &group_bh);
1737                 if (status < 0) {
1738                         mlog_errno(status);
1739                         goto bail;
1740                 }
1741                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1742         }
1743         if (status < 0) {
1744                 if (status != -ENOSPC)
1745                         mlog_errno(status);
1746                 goto bail;
1747         }
1748
1749         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1750              res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1751
1752         res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1753
1754         BUG_ON(res->sr_bits == 0);
1755         if (!status)
1756                 ocfs2_bg_discontig_fix_result(ac, bg, res);
1757
1758
1759         /*
1760          * Keep track of previous block descriptor read. When
1761          * we find a target, if we have read more than X
1762          * number of descriptors, and the target is reasonably
1763          * empty, relink him to top of his chain.
1764          *
1765          * We've read 0 extra blocks and only send one more to
1766          * the transaction, yet the next guy to search has a
1767          * much easier time.
1768          *
1769          * Do this *after* figuring out how many bits we're taking out
1770          * of our target group.
1771          */
1772         if (ac->ac_allow_chain_relink &&
1773             (prev_group_bh) &&
1774             (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1775                 status = ocfs2_relink_block_group(handle, alloc_inode,
1776                                                   ac->ac_bh, group_bh,
1777                                                   prev_group_bh, chain);
1778                 if (status < 0) {
1779                         mlog_errno(status);
1780                         goto bail;
1781                 }
1782         }
1783
1784         /* Ok, claim our bits now: set the info on dinode, chainlist
1785          * and then the group */
1786         status = ocfs2_journal_access_di(handle,
1787                                          INODE_CACHE(alloc_inode),
1788                                          ac->ac_bh,
1789                                          OCFS2_JOURNAL_ACCESS_WRITE);
1790         if (status < 0) {
1791                 mlog_errno(status);
1792                 goto bail;
1793         }
1794
1795         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1796         fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1797         le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1798         ocfs2_journal_dirty(handle, ac->ac_bh);
1799
1800         status = ocfs2_block_group_set_bits(handle,
1801                                             alloc_inode,
1802                                             bg,
1803                                             group_bh,
1804                                             res->sr_bit_offset,
1805                                             res->sr_bits);
1806         if (status < 0) {
1807                 mlog_errno(status);
1808                 goto bail;
1809         }
1810
1811         mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1812              (unsigned long long)le64_to_cpu(fe->i_blkno));
1813
1814         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1815 bail:
1816         brelse(group_bh);
1817         brelse(prev_group_bh);
1818
1819         mlog_exit(status);
1820         return status;
1821 }
1822
1823 /* will give out up to bits_wanted contiguous bits. */
1824 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1825                                      handle_t *handle,
1826                                      u32 bits_wanted,
1827                                      u32 min_bits,
1828                                      struct ocfs2_suballoc_result *res)
1829 {
1830         int status;
1831         u16 victim, i;
1832         u16 bits_left = 0;
1833         struct ocfs2_chain_list *cl;
1834         struct ocfs2_dinode *fe;
1835
1836         mlog_entry_void();
1837
1838         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1839         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1840         BUG_ON(!ac->ac_bh);
1841
1842         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1843
1844         /* The bh was validated by the inode read during
1845          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1846         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1847
1848         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1849             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1850                 ocfs2_error(ac->ac_inode->i_sb,
1851                             "Chain allocator dinode %llu has %u used "
1852                             "bits but only %u total.",
1853                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1854                             le32_to_cpu(fe->id1.bitmap1.i_used),
1855                             le32_to_cpu(fe->id1.bitmap1.i_total));
1856                 status = -EIO;
1857                 goto bail;
1858         }
1859
1860         res->sr_bg_blkno = ac->ac_last_group;
1861         if (res->sr_bg_blkno) {
1862                 /* Attempt to short-circuit the usual search mechanism
1863                  * by jumping straight to the most recently used
1864                  * allocation group. This helps us mantain some
1865                  * contiguousness across allocations. */
1866                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1867                                                 min_bits, res, &bits_left);
1868                 if (!status)
1869                         goto set_hint;
1870                 if (status < 0 && status != -ENOSPC) {
1871                         mlog_errno(status);
1872                         goto bail;
1873                 }
1874         }
1875
1876         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1877
1878         victim = ocfs2_find_victim_chain(cl);
1879         ac->ac_chain = victim;
1880         ac->ac_allow_chain_relink = 1;
1881
1882         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1883                                     res, &bits_left);
1884         if (!status)
1885                 goto set_hint;
1886         if (status < 0 && status != -ENOSPC) {
1887                 mlog_errno(status);
1888                 goto bail;
1889         }
1890
1891         mlog(0, "Search of victim chain %u came up with nothing, "
1892              "trying all chains now.\n", victim);
1893
1894         /* If we didn't pick a good victim, then just default to
1895          * searching each chain in order. Don't allow chain relinking
1896          * because we only calculate enough journal credits for one
1897          * relink per alloc. */
1898         ac->ac_allow_chain_relink = 0;
1899         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1900                 if (i == victim)
1901                         continue;
1902                 if (!cl->cl_recs[i].c_free)
1903                         continue;
1904
1905                 ac->ac_chain = i;
1906                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1907                                             res, &bits_left);
1908                 if (!status)
1909                         break;
1910                 if (status < 0 && status != -ENOSPC) {
1911                         mlog_errno(status);
1912                         goto bail;
1913                 }
1914         }
1915
1916 set_hint:
1917         if (status != -ENOSPC) {
1918                 /* If the next search of this group is not likely to
1919                  * yield a suitable extent, then we reset the last
1920                  * group hint so as to not waste a disk read */
1921                 if (bits_left < min_bits)
1922                         ac->ac_last_group = 0;
1923                 else
1924                         ac->ac_last_group = res->sr_bg_blkno;
1925         }
1926
1927 bail:
1928         mlog_exit(status);
1929         return status;
1930 }
1931
1932 int ocfs2_claim_metadata(handle_t *handle,
1933                          struct ocfs2_alloc_context *ac,
1934                          u32 bits_wanted,
1935                          u64 *suballoc_loc,
1936                          u16 *suballoc_bit_start,
1937                          unsigned int *num_bits,
1938                          u64 *blkno_start)
1939 {
1940         int status;
1941         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1942
1943         BUG_ON(!ac);
1944         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1945         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1946
1947         status = ocfs2_claim_suballoc_bits(ac,
1948                                            handle,
1949                                            bits_wanted,
1950                                            1,
1951                                            &res);
1952         if (status < 0) {
1953                 mlog_errno(status);
1954                 goto bail;
1955         }
1956         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1957
1958         *suballoc_loc = res.sr_bg_blkno;
1959         *suballoc_bit_start = res.sr_bit_offset;
1960         *blkno_start = res.sr_blkno;
1961         ac->ac_bits_given += res.sr_bits;
1962         *num_bits = res.sr_bits;
1963         status = 0;
1964 bail:
1965         mlog_exit(status);
1966         return status;
1967 }
1968
1969 static void ocfs2_init_inode_ac_group(struct inode *dir,
1970                                       struct buffer_head *parent_fe_bh,
1971                                       struct ocfs2_alloc_context *ac)
1972 {
1973         struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1974         /*
1975          * Try to allocate inodes from some specific group.
1976          *
1977          * If the parent dir has recorded the last group used in allocation,
1978          * cool, use it. Otherwise if we try to allocate new inode from the
1979          * same slot the parent dir belongs to, use the same chunk.
1980          *
1981          * We are very careful here to avoid the mistake of setting
1982          * ac_last_group to a group descriptor from a different (unlocked) slot.
1983          */
1984         if (OCFS2_I(dir)->ip_last_used_group &&
1985             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1986                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1987         else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1988                 ac->ac_last_group = ocfs2_which_suballoc_group(
1989                                         le64_to_cpu(fe->i_blkno),
1990                                         le16_to_cpu(fe->i_suballoc_bit));
1991 }
1992
1993 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1994                                              struct ocfs2_alloc_context *ac)
1995 {
1996         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1997         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1998 }
1999
2000 int ocfs2_claim_new_inode(handle_t *handle,
2001                           struct inode *dir,
2002                           struct buffer_head *parent_fe_bh,
2003                           struct ocfs2_alloc_context *ac,
2004                           u64 *suballoc_loc,
2005                           u16 *suballoc_bit,
2006                           u64 *fe_blkno)
2007 {
2008         int status;
2009         struct ocfs2_suballoc_result res;
2010
2011         mlog_entry_void();
2012
2013         BUG_ON(!ac);
2014         BUG_ON(ac->ac_bits_given != 0);
2015         BUG_ON(ac->ac_bits_wanted != 1);
2016         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2017
2018         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2019
2020         status = ocfs2_claim_suballoc_bits(ac,
2021                                            handle,
2022                                            1,
2023                                            1,
2024                                            &res);
2025         if (status < 0) {
2026                 mlog_errno(status);
2027                 goto bail;
2028         }
2029         atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2030
2031         BUG_ON(res.sr_bits != 1);
2032
2033         *suballoc_loc = res.sr_bg_blkno;
2034         *suballoc_bit = res.sr_bit_offset;
2035         *fe_blkno = res.sr_blkno;
2036         ac->ac_bits_given++;
2037         ocfs2_save_inode_ac_group(dir, ac);
2038         status = 0;
2039 bail:
2040         mlog_exit(status);
2041         return status;
2042 }
2043
2044 /* translate a group desc. blkno and it's bitmap offset into
2045  * disk cluster offset. */
2046 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2047                                                    u64 bg_blkno,
2048                                                    u16 bg_bit_off)
2049 {
2050         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2051         u32 cluster = 0;
2052
2053         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2054
2055         if (bg_blkno != osb->first_cluster_group_blkno)
2056                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2057         cluster += (u32) bg_bit_off;
2058         return cluster;
2059 }
2060
2061 /* given a cluster offset, calculate which block group it belongs to
2062  * and return that block offset. */
2063 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2064 {
2065         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2066         u32 group_no;
2067
2068         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2069
2070         group_no = cluster / osb->bitmap_cpg;
2071         if (!group_no)
2072                 return osb->first_cluster_group_blkno;
2073         return ocfs2_clusters_to_blocks(inode->i_sb,
2074                                         group_no * osb->bitmap_cpg);
2075 }
2076
2077 /* given the block number of a cluster start, calculate which cluster
2078  * group and descriptor bitmap offset that corresponds to. */
2079 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2080                                                 u64 data_blkno,
2081                                                 u64 *bg_blkno,
2082                                                 u16 *bg_bit_off)
2083 {
2084         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2085         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2086
2087         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2088
2089         *bg_blkno = ocfs2_which_cluster_group(inode,
2090                                               data_cluster);
2091
2092         if (*bg_blkno == osb->first_cluster_group_blkno)
2093                 *bg_bit_off = (u16) data_cluster;
2094         else
2095                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2096                                                              data_blkno - *bg_blkno);
2097 }
2098
2099 /*
2100  * min_bits - minimum contiguous chunk from this total allocation we
2101  * can handle. set to what we asked for originally for a full
2102  * contig. allocation, set to '1' to indicate we can deal with extents
2103  * of any size.
2104  */
2105 int __ocfs2_claim_clusters(handle_t *handle,
2106                            struct ocfs2_alloc_context *ac,
2107                            u32 min_clusters,
2108                            u32 max_clusters,
2109                            u32 *cluster_start,
2110                            u32 *num_clusters)
2111 {
2112         int status;
2113         unsigned int bits_wanted = max_clusters;
2114         struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2115         struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2116
2117         mlog_entry_void();
2118
2119         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2120
2121         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2122                && ac->ac_which != OCFS2_AC_USE_MAIN);
2123
2124         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2125                 WARN_ON(min_clusters > 1);
2126
2127                 status = ocfs2_claim_local_alloc_bits(osb,
2128                                                       handle,
2129                                                       ac,
2130                                                       bits_wanted,
2131                                                       cluster_start,
2132                                                       num_clusters);
2133                 if (!status)
2134                         atomic_inc(&osb->alloc_stats.local_data);
2135         } else {
2136                 if (min_clusters > (osb->bitmap_cpg - 1)) {
2137                         /* The only paths asking for contiguousness
2138                          * should know about this already. */
2139                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2140                              "group bitmap size %u!\n", min_clusters,
2141                              osb->bitmap_cpg);
2142                         status = -ENOSPC;
2143                         goto bail;
2144                 }
2145                 /* clamp the current request down to a realistic size. */
2146                 if (bits_wanted > (osb->bitmap_cpg - 1))
2147                         bits_wanted = osb->bitmap_cpg - 1;
2148
2149                 status = ocfs2_claim_suballoc_bits(ac,
2150                                                    handle,
2151                                                    bits_wanted,
2152                                                    min_clusters,
2153                                                    &res);
2154                 if (!status) {
2155                         BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2156                         *cluster_start =
2157                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2158                                                                  res.sr_bg_blkno,
2159                                                                  res.sr_bit_offset);
2160                         atomic_inc(&osb->alloc_stats.bitmap_data);
2161                 }
2162         }
2163         if (status < 0) {
2164                 if (status != -ENOSPC)
2165                         mlog_errno(status);
2166                 goto bail;
2167         }
2168
2169         ac->ac_bits_given += res.sr_bits;
2170         *num_clusters = res.sr_bits;
2171
2172 bail:
2173         mlog_exit(status);
2174         return status;
2175 }
2176
2177 int ocfs2_claim_clusters(handle_t *handle,
2178                          struct ocfs2_alloc_context *ac,
2179                          u32 min_clusters,
2180                          u32 *cluster_start,
2181                          u32 *num_clusters)
2182 {
2183         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2184
2185         return __ocfs2_claim_clusters(handle, ac, min_clusters,
2186                                       bits_wanted, cluster_start, num_clusters);
2187 }
2188
2189 static int ocfs2_block_group_clear_bits(handle_t *handle,
2190                                         struct inode *alloc_inode,
2191                                         struct ocfs2_group_desc *bg,
2192                                         struct buffer_head *group_bh,
2193                                         unsigned int bit_off,
2194                                         unsigned int num_bits,
2195                                         void (*undo_fn)(unsigned int bit,
2196                                                         unsigned long *bmap))
2197 {
2198         int status;
2199         unsigned int tmp;
2200         struct ocfs2_group_desc *undo_bg = NULL;
2201
2202         mlog_entry_void();
2203
2204         /* The caller got this descriptor from
2205          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2206         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2207
2208         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
2209
2210         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2211         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2212                                          group_bh,
2213                                          undo_fn ?
2214                                          OCFS2_JOURNAL_ACCESS_UNDO :
2215                                          OCFS2_JOURNAL_ACCESS_WRITE);
2216         if (status < 0) {
2217                 mlog_errno(status);
2218                 goto bail;
2219         }
2220
2221         if (undo_fn) {
2222                 jbd_lock_bh_state(group_bh);
2223                 undo_bg = (struct ocfs2_group_desc *)
2224                                         bh2jh(group_bh)->b_committed_data;
2225                 BUG_ON(!undo_bg);
2226         }
2227
2228         tmp = num_bits;
2229         while(tmp--) {
2230                 ocfs2_clear_bit((bit_off + tmp),
2231                                 (unsigned long *) bg->bg_bitmap);
2232                 if (undo_fn)
2233                         undo_fn(bit_off + tmp,
2234                                 (unsigned long *) undo_bg->bg_bitmap);
2235         }
2236         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2237
2238         if (undo_fn)
2239                 jbd_unlock_bh_state(group_bh);
2240
2241         ocfs2_journal_dirty(handle, group_bh);
2242 bail:
2243         return status;
2244 }
2245
2246 /*
2247  * expects the suballoc inode to already be locked.
2248  */
2249 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2250                                      struct inode *alloc_inode,
2251                                      struct buffer_head *alloc_bh,
2252                                      unsigned int start_bit,
2253                                      u64 bg_blkno,
2254                                      unsigned int count,
2255                                      void (*undo_fn)(unsigned int bit,
2256                                                      unsigned long *bitmap))
2257 {
2258         int status = 0;
2259         u32 tmp_used;
2260         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2261         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2262         struct buffer_head *group_bh = NULL;
2263         struct ocfs2_group_desc *group;
2264
2265         mlog_entry_void();
2266
2267         /* The alloc_bh comes from ocfs2_free_dinode() or
2268          * ocfs2_free_clusters().  The callers have all locked the
2269          * allocator and gotten alloc_bh from the lock call.  This
2270          * validates the dinode buffer.  Any corruption that has happended
2271          * is a code bug. */
2272         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2273         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2274
2275         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2276              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
2277              (unsigned long long)bg_blkno, start_bit);
2278
2279         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2280                                              &group_bh);
2281         if (status < 0) {
2282                 mlog_errno(status);
2283                 goto bail;
2284         }
2285         group = (struct ocfs2_group_desc *) group_bh->b_data;
2286
2287         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2288
2289         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2290                                               group, group_bh,
2291                                               start_bit, count, undo_fn);
2292         if (status < 0) {
2293                 mlog_errno(status);
2294                 goto bail;
2295         }
2296
2297         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2298                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2299         if (status < 0) {
2300                 mlog_errno(status);
2301                 goto bail;
2302         }
2303
2304         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2305                      count);
2306         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2307         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2308         ocfs2_journal_dirty(handle, alloc_bh);
2309
2310 bail:
2311         brelse(group_bh);
2312
2313         mlog_exit(status);
2314         return status;
2315 }
2316
2317 int ocfs2_free_suballoc_bits(handle_t *handle,
2318                              struct inode *alloc_inode,
2319                              struct buffer_head *alloc_bh,
2320                              unsigned int start_bit,
2321                              u64 bg_blkno,
2322                              unsigned int count)
2323 {
2324         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2325                                          start_bit, bg_blkno, count, NULL);
2326 }
2327
2328 int ocfs2_free_dinode(handle_t *handle,
2329                       struct inode *inode_alloc_inode,
2330                       struct buffer_head *inode_alloc_bh,
2331                       struct ocfs2_dinode *di)
2332 {
2333         u64 blk = le64_to_cpu(di->i_blkno);
2334         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2335         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2336
2337         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2338                                         inode_alloc_bh, bit, bg_blkno, 1);
2339 }
2340
2341 static int _ocfs2_free_clusters(handle_t *handle,
2342                                 struct inode *bitmap_inode,
2343                                 struct buffer_head *bitmap_bh,
2344                                 u64 start_blk,
2345                                 unsigned int num_clusters,
2346                                 void (*undo_fn)(unsigned int bit,
2347                                                 unsigned long *bitmap))
2348 {
2349         int status;
2350         u16 bg_start_bit;
2351         u64 bg_blkno;
2352         struct ocfs2_dinode *fe;
2353
2354         /* You can't ever have a contiguous set of clusters
2355          * bigger than a block group bitmap so we never have to worry
2356          * about looping on them. */
2357
2358         mlog_entry_void();
2359
2360         /* This is expensive. We can safely remove once this stuff has
2361          * gotten tested really well. */
2362         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2363
2364         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2365
2366         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2367                                      &bg_start_bit);
2368
2369         mlog(0, "want to free %u clusters starting at block %llu\n",
2370              num_clusters, (unsigned long long)start_blk);
2371         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2372              (unsigned long long)bg_blkno, bg_start_bit);
2373
2374         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2375                                            bg_start_bit, bg_blkno,
2376                                            num_clusters, undo_fn);
2377         if (status < 0) {
2378                 mlog_errno(status);
2379                 goto out;
2380         }
2381
2382         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2383                                          num_clusters);
2384
2385 out:
2386         mlog_exit(status);
2387         return status;
2388 }
2389
2390 int ocfs2_free_clusters(handle_t *handle,
2391                         struct inode *bitmap_inode,
2392                         struct buffer_head *bitmap_bh,
2393                         u64 start_blk,
2394                         unsigned int num_clusters)
2395 {
2396         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2397                                     start_blk, num_clusters,
2398                                     _ocfs2_set_bit);
2399 }
2400
2401 /*
2402  * Give never-used clusters back to the global bitmap.  We don't need
2403  * to protect these bits in the undo buffer.
2404  */
2405 int ocfs2_release_clusters(handle_t *handle,
2406                            struct inode *bitmap_inode,
2407                            struct buffer_head *bitmap_bh,
2408                            u64 start_blk,
2409                            unsigned int num_clusters)
2410 {
2411         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2412                                     start_blk, num_clusters,
2413                                     _ocfs2_clear_bit);
2414 }
2415
2416 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2417 {
2418         printk("Block Group:\n");
2419         printk("bg_signature:       %s\n", bg->bg_signature);
2420         printk("bg_size:            %u\n", bg->bg_size);
2421         printk("bg_bits:            %u\n", bg->bg_bits);
2422         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2423         printk("bg_chain:           %u\n", bg->bg_chain);
2424         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2425         printk("bg_next_group:      %llu\n",
2426                (unsigned long long)bg->bg_next_group);
2427         printk("bg_parent_dinode:   %llu\n",
2428                (unsigned long long)bg->bg_parent_dinode);
2429         printk("bg_blkno:           %llu\n",
2430                (unsigned long long)bg->bg_blkno);
2431 }
2432
2433 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2434 {
2435         int i;
2436
2437         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2438         printk("i_signature:                  %s\n", fe->i_signature);
2439         printk("i_size:                       %llu\n",
2440                (unsigned long long)fe->i_size);
2441         printk("i_clusters:                   %u\n", fe->i_clusters);
2442         printk("i_generation:                 %u\n",
2443                le32_to_cpu(fe->i_generation));
2444         printk("id1.bitmap1.i_used:           %u\n",
2445                le32_to_cpu(fe->id1.bitmap1.i_used));
2446         printk("id1.bitmap1.i_total:          %u\n",
2447                le32_to_cpu(fe->id1.bitmap1.i_total));
2448         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2449         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2450         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2451         printk("id2.i_chain.cl_next_free_rec: %u\n",
2452                fe->id2.i_chain.cl_next_free_rec);
2453         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2454                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2455                        fe->id2.i_chain.cl_recs[i].c_free);
2456                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2457                        fe->id2.i_chain.cl_recs[i].c_total);
2458                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2459                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2460         }
2461 }
2462
2463 /*
2464  * For a given allocation, determine which allocators will need to be
2465  * accessed, and lock them, reserving the appropriate number of bits.
2466  *
2467  * Sparse file systems call this from ocfs2_write_begin_nolock()
2468  * and ocfs2_allocate_unwritten_extents().
2469  *
2470  * File systems which don't support holes call this from
2471  * ocfs2_extend_allocation().
2472  */
2473 int ocfs2_lock_allocators(struct inode *inode,
2474                           struct ocfs2_extent_tree *et,
2475                           u32 clusters_to_add, u32 extents_to_split,
2476                           struct ocfs2_alloc_context **data_ac,
2477                           struct ocfs2_alloc_context **meta_ac)
2478 {
2479         int ret = 0, num_free_extents;
2480         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2481         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2482
2483         *meta_ac = NULL;
2484         if (data_ac)
2485                 *data_ac = NULL;
2486
2487         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2488
2489         num_free_extents = ocfs2_num_free_extents(osb, et);
2490         if (num_free_extents < 0) {
2491                 ret = num_free_extents;
2492                 mlog_errno(ret);
2493                 goto out;
2494         }
2495
2496         /*
2497          * Sparse allocation file systems need to be more conservative
2498          * with reserving room for expansion - the actual allocation
2499          * happens while we've got a journal handle open so re-taking
2500          * a cluster lock (because we ran out of room for another
2501          * extent) will violate ordering rules.
2502          *
2503          * Most of the time we'll only be seeing this 1 cluster at a time
2504          * anyway.
2505          *
2506          * Always lock for any unwritten extents - we might want to
2507          * add blocks during a split.
2508          */
2509         if (!num_free_extents ||
2510             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2511                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2512                 if (ret < 0) {
2513                         if (ret != -ENOSPC)
2514                                 mlog_errno(ret);
2515                         goto out;
2516                 }
2517         }
2518
2519         if (clusters_to_add == 0)
2520                 goto out;
2521
2522         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2523         if (ret < 0) {
2524                 if (ret != -ENOSPC)
2525                         mlog_errno(ret);
2526                 goto out;
2527         }
2528
2529 out:
2530         if (ret) {
2531                 if (*meta_ac) {
2532                         ocfs2_free_alloc_context(*meta_ac);
2533                         *meta_ac = NULL;
2534                 }
2535
2536                 /*
2537                  * We cannot have an error and a non null *data_ac.
2538                  */
2539         }
2540
2541         return ret;
2542 }
2543
2544 /*
2545  * Read the inode specified by blkno to get suballoc_slot and
2546  * suballoc_bit.
2547  */
2548 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2549                                        u16 *suballoc_slot, u16 *suballoc_bit)
2550 {
2551         int status;
2552         struct buffer_head *inode_bh = NULL;
2553         struct ocfs2_dinode *inode_fe;
2554
2555         mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2556
2557         /* dirty read disk */
2558         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2559         if (status < 0) {
2560                 mlog(ML_ERROR, "read block %llu failed %d\n",
2561                      (unsigned long long)blkno, status);
2562                 goto bail;
2563         }
2564
2565         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2566         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2567                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2568                      (unsigned long long)blkno);
2569                 status = -EINVAL;
2570                 goto bail;
2571         }
2572
2573         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2574             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2575                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2576                      (unsigned long long)blkno,
2577                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2578                 status = -EINVAL;
2579                 goto bail;
2580         }
2581
2582         if (suballoc_slot)
2583                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2584         if (suballoc_bit)
2585                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2586
2587 bail:
2588         brelse(inode_bh);
2589
2590         mlog_exit(status);
2591         return status;
2592 }
2593
2594 /*
2595  * test whether bit is SET in allocator bitmap or not.  on success, 0
2596  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2597  * is returned and *res is meaningless.  Call this after you have
2598  * cluster locked against suballoc, or you may get a result based on
2599  * non-up2date contents
2600  */
2601 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2602                                    struct inode *suballoc,
2603                                    struct buffer_head *alloc_bh, u64 blkno,
2604                                    u16 bit, int *res)
2605 {
2606         struct ocfs2_dinode *alloc_fe;
2607         struct ocfs2_group_desc *group;
2608         struct buffer_head *group_bh = NULL;
2609         u64 bg_blkno;
2610         int status;
2611
2612         mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2613                    (unsigned int)bit);
2614
2615         alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2616         if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2617                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2618                      (unsigned int)bit,
2619                      ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2620                 status = -EINVAL;
2621                 goto bail;
2622         }
2623
2624         bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2625         status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2626                                              &group_bh);
2627         if (status < 0) {
2628                 mlog(ML_ERROR, "read group %llu failed %d\n",
2629                      (unsigned long long)bg_blkno, status);
2630                 goto bail;
2631         }
2632
2633         group = (struct ocfs2_group_desc *) group_bh->b_data;
2634         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2635
2636 bail:
2637         brelse(group_bh);
2638
2639         mlog_exit(status);
2640         return status;
2641 }
2642
2643 /*
2644  * Test if the bit representing this inode (blkno) is set in the
2645  * suballocator.
2646  *
2647  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2648  *
2649  * In the event of failure, a negative value is returned and *res is
2650  * meaningless.
2651  *
2652  * Callers must make sure to hold nfs_sync_lock to prevent
2653  * ocfs2_delete_inode() on another node from accessing the same
2654  * suballocator concurrently.
2655  */
2656 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2657 {
2658         int status;
2659         u16 suballoc_bit = 0, suballoc_slot = 0;
2660         struct inode *inode_alloc_inode;
2661         struct buffer_head *alloc_bh = NULL;
2662
2663         mlog_entry("blkno: %llu", (unsigned long long)blkno);
2664
2665         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2666                                              &suballoc_bit);
2667         if (status < 0) {
2668                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2669                 goto bail;
2670         }
2671
2672         inode_alloc_inode =
2673                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2674                                             suballoc_slot);
2675         if (!inode_alloc_inode) {
2676                 /* the error code could be inaccurate, but we are not able to
2677                  * get the correct one. */
2678                 status = -EINVAL;
2679                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2680                      (u32)suballoc_slot);
2681                 goto bail;
2682         }
2683
2684         mutex_lock(&inode_alloc_inode->i_mutex);
2685         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2686         if (status < 0) {
2687                 mutex_unlock(&inode_alloc_inode->i_mutex);
2688                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2689                      (u32)suballoc_slot, status);
2690                 goto bail;
2691         }
2692
2693         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2694                                          blkno, suballoc_bit, res);
2695         if (status < 0)
2696                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2697
2698         ocfs2_inode_unlock(inode_alloc_inode, 0);
2699         mutex_unlock(&inode_alloc_inode->i_mutex);
2700
2701         iput(inode_alloc_inode);
2702         brelse(alloc_bh);
2703 bail:
2704         mlog_exit(status);
2705         return status;
2706 }