667d622b365991e34fc49931ec1edd749763e218
[safe/jmp/linux-2.6] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
/*
 * Flags passed down the suballocator reservation paths.
 *
 * NOT_ALLOC_NEW_GROUP      - no flag set; used when stealing from another
 *                            slot's allocator.
 * ALLOC_NEW_GROUP          - ocfs2_reserve_suballoc_bits() may add a new
 *                            block group when the allocator is short on bits.
 * ALLOC_GROUPS_FROM_GLOBAL - forwarded to ocfs2_reserve_clusters_with_limit()
 *                            when a new group is allocated.
 *                            NOTE(review): presumably forces the clusters to
 *                            come from the global bitmap - confirm there.
 */
#define NOT_ALLOC_NEW_GROUP		0x0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/*
 * Once the stolen counter reaches this value, the reserve paths stop going
 * straight to stealing and try our own slot's allocator again.
 */
#define OCFS2_MAX_TO_STEAL		1024
55
56 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
59 static int ocfs2_block_group_fill(handle_t *handle,
60                                   struct inode *alloc_inode,
61                                   struct buffer_head *bg_bh,
62                                   u64 group_blkno,
63                                   u16 my_chain,
64                                   struct ocfs2_chain_list *cl);
65 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
66                                    struct inode *alloc_inode,
67                                    struct buffer_head *bh,
68                                    u64 max_block,
69                                    u64 *last_alloc_group,
70                                    int flags);
71
72 static int ocfs2_cluster_group_search(struct inode *inode,
73                                       struct buffer_head *group_bh,
74                                       u32 bits_wanted, u32 min_bits,
75                                       u64 max_block,
76                                       u16 *bit_off, u16 *bits_found);
77 static int ocfs2_block_group_search(struct inode *inode,
78                                     struct buffer_head *group_bh,
79                                     u32 bits_wanted, u32 min_bits,
80                                     u64 max_block,
81                                     u16 *bit_off, u16 *bits_found);
82 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
83                                      struct ocfs2_alloc_context *ac,
84                                      handle_t *handle,
85                                      u32 bits_wanted,
86                                      u32 min_bits,
87                                      u16 *bit_off,
88                                      unsigned int *num_bits,
89                                      u64 *bg_blkno);
90 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91                                          int nr);
92 static inline int ocfs2_block_group_set_bits(handle_t *handle,
93                                              struct inode *alloc_inode,
94                                              struct ocfs2_group_desc *bg,
95                                              struct buffer_head *group_bh,
96                                              unsigned int bit_off,
97                                              unsigned int num_bits);
98 static int ocfs2_relink_block_group(handle_t *handle,
99                                     struct inode *alloc_inode,
100                                     struct buffer_head *fe_bh,
101                                     struct buffer_head *bg_bh,
102                                     struct buffer_head *prev_bg_bh,
103                                     u16 chain);
104 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
105                                                      u32 wanted);
106 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
107                                                    u64 bg_blkno,
108                                                    u16 bg_bit_off);
109 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
110                                                 u64 data_blkno,
111                                                 u64 *bg_blkno,
112                                                 u16 *bg_bit_off);
113 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
114                                              u32 bits_wanted, u64 max_block,
115                                              int flags,
116                                              struct ocfs2_alloc_context **ac);
117
118 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
119 {
120         struct inode *inode = ac->ac_inode;
121
122         if (inode) {
123                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
124                         ocfs2_inode_unlock(inode, 1);
125
126                 mutex_unlock(&inode->i_mutex);
127
128                 iput(inode);
129                 ac->ac_inode = NULL;
130         }
131         brelse(ac->ac_bh);
132         ac->ac_bh = NULL;
133         ac->ac_resv = NULL;
134 }
135
/*
 * Tear down an allocation context completely: drop the resources it holds,
 * then free the context memory.  @ac must not be used afterwards.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	/* Locks, inode reference and buffer head go first ... */
	ocfs2_free_ac_resource(ac);

	/* ... then the context structure itself. */
	kfree(ac);
}
141
142 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
143 {
144         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
145 }
146
/*
 * Report a group descriptor validation failure.
 *
 * Relies on two names being in scope at the point of use: `resize` (int)
 * and `sb` (struct super_block *).  With `resize` set the problem is only
 * logged through mlog(ML_ERROR); otherwise it is passed to ocfs2_error().
 */
#define do_error(fmt, ...)						\
	do {								\
		if (!resize)						\
			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
		else							\
			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
	} while (0)
154
155 static int ocfs2_validate_gd_self(struct super_block *sb,
156                                   struct buffer_head *bh,
157                                   int resize)
158 {
159         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
160
161         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
162                 do_error("Group descriptor #%llu has bad signature %.*s",
163                          (unsigned long long)bh->b_blocknr, 7,
164                          gd->bg_signature);
165                 return -EINVAL;
166         }
167
168         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
169                 do_error("Group descriptor #%llu has an invalid bg_blkno "
170                          "of %llu",
171                          (unsigned long long)bh->b_blocknr,
172                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
173                 return -EINVAL;
174         }
175
176         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
177                 do_error("Group descriptor #%llu has an invalid "
178                          "fs_generation of #%u",
179                          (unsigned long long)bh->b_blocknr,
180                          le32_to_cpu(gd->bg_generation));
181                 return -EINVAL;
182         }
183
184         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
185                 do_error("Group descriptor #%llu has bit count %u but "
186                          "claims that %u are free",
187                          (unsigned long long)bh->b_blocknr,
188                          le16_to_cpu(gd->bg_bits),
189                          le16_to_cpu(gd->bg_free_bits_count));
190                 return -EINVAL;
191         }
192
193         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
194                 do_error("Group descriptor #%llu has bit count %u but "
195                          "max bitmap bits of %u",
196                          (unsigned long long)bh->b_blocknr,
197                          le16_to_cpu(gd->bg_bits),
198                          8 * le16_to_cpu(gd->bg_size));
199                 return -EINVAL;
200         }
201
202         return 0;
203 }
204
205 static int ocfs2_validate_gd_parent(struct super_block *sb,
206                                     struct ocfs2_dinode *di,
207                                     struct buffer_head *bh,
208                                     int resize)
209 {
210         unsigned int max_bits;
211         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
212
213         if (di->i_blkno != gd->bg_parent_dinode) {
214                 do_error("Group descriptor #%llu has bad parent "
215                          "pointer (%llu, expected %llu)",
216                          (unsigned long long)bh->b_blocknr,
217                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
218                          (unsigned long long)le64_to_cpu(di->i_blkno));
219                 return -EINVAL;
220         }
221
222         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
223         if (le16_to_cpu(gd->bg_bits) > max_bits) {
224                 do_error("Group descriptor #%llu has bit count of %u",
225                          (unsigned long long)bh->b_blocknr,
226                          le16_to_cpu(gd->bg_bits));
227                 return -EINVAL;
228         }
229
230         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
231         if ((le16_to_cpu(gd->bg_chain) >
232              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
233             ((le16_to_cpu(gd->bg_chain) ==
234              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
235                 do_error("Group descriptor #%llu has bad chain %u",
236                          (unsigned long long)bh->b_blocknr,
237                          le16_to_cpu(gd->bg_chain));
238                 return -EINVAL;
239         }
240
241         return 0;
242 }
243
244 #undef do_error
245
246 /*
247  * This version only prints errors.  It does not fail the filesystem, and
248  * exists only for resize.
249  */
250 int ocfs2_check_group_descriptor(struct super_block *sb,
251                                  struct ocfs2_dinode *di,
252                                  struct buffer_head *bh)
253 {
254         int rc;
255         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
256
257         BUG_ON(!buffer_uptodate(bh));
258
259         /*
260          * If the ecc fails, we return the error but otherwise
261          * leave the filesystem running.  We know any error is
262          * local to this block.
263          */
264         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
265         if (rc) {
266                 mlog(ML_ERROR,
267                      "Checksum failed for group descriptor %llu\n",
268                      (unsigned long long)bh->b_blocknr);
269         } else
270                 rc = ocfs2_validate_gd_self(sb, bh, 1);
271         if (!rc)
272                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
273
274         return rc;
275 }
276
277 static int ocfs2_validate_group_descriptor(struct super_block *sb,
278                                            struct buffer_head *bh)
279 {
280         int rc;
281         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
282
283         mlog(0, "Validating group descriptor %llu\n",
284              (unsigned long long)bh->b_blocknr);
285
286         BUG_ON(!buffer_uptodate(bh));
287
288         /*
289          * If the ecc fails, we return the error but otherwise
290          * leave the filesystem running.  We know any error is
291          * local to this block.
292          */
293         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
294         if (rc)
295                 return rc;
296
297         /*
298          * Errors after here are fatal.
299          */
300
301         return ocfs2_validate_gd_self(sb, bh, 0);
302 }
303
304 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
305                                 u64 gd_blkno, struct buffer_head **bh)
306 {
307         int rc;
308         struct buffer_head *tmp = *bh;
309
310         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
311                               ocfs2_validate_group_descriptor);
312         if (rc)
313                 goto out;
314
315         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
316         if (rc) {
317                 brelse(tmp);
318                 goto out;
319         }
320
321         /* If ocfs2_read_block() got us a new bh, pass it up. */
322         if (!*bh)
323                 *bh = tmp;
324
325 out:
326         return rc;
327 }
328
329 static int ocfs2_block_group_fill(handle_t *handle,
330                                   struct inode *alloc_inode,
331                                   struct buffer_head *bg_bh,
332                                   u64 group_blkno,
333                                   u16 my_chain,
334                                   struct ocfs2_chain_list *cl)
335 {
336         int status = 0;
337         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
338         struct super_block * sb = alloc_inode->i_sb;
339
340         mlog_entry_void();
341
342         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
343                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
344                             "b_blocknr (%llu)",
345                             (unsigned long long)group_blkno,
346                             (unsigned long long) bg_bh->b_blocknr);
347                 status = -EIO;
348                 goto bail;
349         }
350
351         status = ocfs2_journal_access_gd(handle,
352                                          INODE_CACHE(alloc_inode),
353                                          bg_bh,
354                                          OCFS2_JOURNAL_ACCESS_CREATE);
355         if (status < 0) {
356                 mlog_errno(status);
357                 goto bail;
358         }
359
360         memset(bg, 0, sb->s_blocksize);
361         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
362         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
363         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
364         bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
365         bg->bg_chain = cpu_to_le16(my_chain);
366         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
367         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
368         bg->bg_blkno = cpu_to_le64(group_blkno);
369         /* set the 1st bit in the bitmap to account for the descriptor block */
370         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
371         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
372
373         ocfs2_journal_dirty(handle, bg_bh);
374
375         /* There is no need to zero out or otherwise initialize the
376          * other blocks in a group - All valid FS metadata in a block
377          * group stores the superblock fs_generation value at
378          * allocation time. */
379
380 bail:
381         mlog_exit(status);
382         return status;
383 }
384
385 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
386 {
387         u16 curr, best;
388
389         best = curr = 0;
390         while (curr < le16_to_cpu(cl->cl_count)) {
391                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
392                     le32_to_cpu(cl->cl_recs[curr].c_total))
393                         best = curr;
394                 curr++;
395         }
396         return best;
397 }
398
399 /*
400  * We expect the block group allocator to already be locked.
401  */
402 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
403                                    struct inode *alloc_inode,
404                                    struct buffer_head *bh,
405                                    u64 max_block,
406                                    u64 *last_alloc_group,
407                                    int flags)
408 {
409         int status, credits;
410         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
411         struct ocfs2_chain_list *cl;
412         struct ocfs2_alloc_context *ac = NULL;
413         handle_t *handle = NULL;
414         u32 bit_off, num_bits;
415         u16 alloc_rec;
416         u64 bg_blkno;
417         struct buffer_head *bg_bh = NULL;
418         struct ocfs2_group_desc *bg;
419
420         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
421
422         mlog_entry_void();
423
424         cl = &fe->id2.i_chain;
425         status = ocfs2_reserve_clusters_with_limit(osb,
426                                                    le16_to_cpu(cl->cl_cpg),
427                                                    max_block, flags, &ac);
428         if (status < 0) {
429                 if (status != -ENOSPC)
430                         mlog_errno(status);
431                 goto bail;
432         }
433
434         credits = ocfs2_calc_group_alloc_credits(osb->sb,
435                                                  le16_to_cpu(cl->cl_cpg));
436         handle = ocfs2_start_trans(osb, credits);
437         if (IS_ERR(handle)) {
438                 status = PTR_ERR(handle);
439                 handle = NULL;
440                 mlog_errno(status);
441                 goto bail;
442         }
443
444         if (last_alloc_group && *last_alloc_group != 0) {
445                 mlog(0, "use old allocation group %llu for block group alloc\n",
446                      (unsigned long long)*last_alloc_group);
447                 ac->ac_last_group = *last_alloc_group;
448         }
449         status = ocfs2_claim_clusters(osb,
450                                       handle,
451                                       ac,
452                                       le16_to_cpu(cl->cl_cpg),
453                                       &bit_off,
454                                       &num_bits);
455         if (status < 0) {
456                 if (status != -ENOSPC)
457                         mlog_errno(status);
458                 goto bail;
459         }
460
461         alloc_rec = ocfs2_find_smallest_chain(cl);
462
463         /* setup the group */
464         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
465         mlog(0, "new descriptor, record %u, at block %llu\n",
466              alloc_rec, (unsigned long long)bg_blkno);
467
468         bg_bh = sb_getblk(osb->sb, bg_blkno);
469         if (!bg_bh) {
470                 status = -EIO;
471                 mlog_errno(status);
472                 goto bail;
473         }
474         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
475
476         status = ocfs2_block_group_fill(handle,
477                                         alloc_inode,
478                                         bg_bh,
479                                         bg_blkno,
480                                         alloc_rec,
481                                         cl);
482         if (status < 0) {
483                 mlog_errno(status);
484                 goto bail;
485         }
486
487         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
488
489         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
490                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
491         if (status < 0) {
492                 mlog_errno(status);
493                 goto bail;
494         }
495
496         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
497                      le16_to_cpu(bg->bg_free_bits_count));
498         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
499         cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
500         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
501                 le16_add_cpu(&cl->cl_next_free_rec, 1);
502
503         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
504                                         le16_to_cpu(bg->bg_free_bits_count));
505         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
506         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
507
508         ocfs2_journal_dirty(handle, bh);
509
510         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
511         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
512         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
513                                              le32_to_cpu(fe->i_clusters)));
514         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
515         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
516         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
517
518         status = 0;
519
520         /* save the new last alloc group so that the caller can cache it. */
521         if (last_alloc_group)
522                 *last_alloc_group = ac->ac_last_group;
523
524 bail:
525         if (handle)
526                 ocfs2_commit_trans(osb, handle);
527
528         if (ac)
529                 ocfs2_free_alloc_context(ac);
530
531         brelse(bg_bh);
532
533         mlog_exit(status);
534         return status;
535 }
536
537 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
538                                        struct ocfs2_alloc_context *ac,
539                                        int type,
540                                        u32 slot,
541                                        u64 *last_alloc_group,
542                                        int flags)
543 {
544         int status;
545         u32 bits_wanted = ac->ac_bits_wanted;
546         struct inode *alloc_inode;
547         struct buffer_head *bh = NULL;
548         struct ocfs2_dinode *fe;
549         u32 free_bits;
550
551         mlog_entry_void();
552
553         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
554         if (!alloc_inode) {
555                 mlog_errno(-EINVAL);
556                 return -EINVAL;
557         }
558
559         mutex_lock(&alloc_inode->i_mutex);
560
561         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
562         if (status < 0) {
563                 mutex_unlock(&alloc_inode->i_mutex);
564                 iput(alloc_inode);
565
566                 mlog_errno(status);
567                 return status;
568         }
569
570         ac->ac_inode = alloc_inode;
571         ac->ac_alloc_slot = slot;
572
573         fe = (struct ocfs2_dinode *) bh->b_data;
574
575         /* The bh was validated by the inode read inside
576          * ocfs2_inode_lock().  Any corruption is a code bug. */
577         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
578
579         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
580                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
581                             (unsigned long long)le64_to_cpu(fe->i_blkno));
582                 status = -EIO;
583                 goto bail;
584         }
585
586         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
587                 le32_to_cpu(fe->id1.bitmap1.i_used);
588
589         if (bits_wanted > free_bits) {
590                 /* cluster bitmap never grows */
591                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
592                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
593                              bits_wanted, free_bits);
594                         status = -ENOSPC;
595                         goto bail;
596                 }
597
598                 if (!(flags & ALLOC_NEW_GROUP)) {
599                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
600                              "and we don't alloc a new group for it.\n",
601                              slot, bits_wanted, free_bits);
602                         status = -ENOSPC;
603                         goto bail;
604                 }
605
606                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
607                                                  ac->ac_max_block,
608                                                  last_alloc_group, flags);
609                 if (status < 0) {
610                         if (status != -ENOSPC)
611                                 mlog_errno(status);
612                         goto bail;
613                 }
614                 atomic_inc(&osb->alloc_stats.bg_extends);
615
616                 /* You should never ask for this much metadata */
617                 BUG_ON(bits_wanted >
618                        (le32_to_cpu(fe->id1.bitmap1.i_total)
619                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
620         }
621
622         get_bh(bh);
623         ac->ac_bh = bh;
624 bail:
625         brelse(bh);
626
627         mlog_exit(status);
628         return status;
629 }
630
631 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
632 {
633         spin_lock(&osb->osb_lock);
634         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
635         spin_unlock(&osb->osb_lock);
636         atomic_set(&osb->s_num_inodes_stolen, 0);
637 }
638
639 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
640 {
641         spin_lock(&osb->osb_lock);
642         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
643         spin_unlock(&osb->osb_lock);
644         atomic_set(&osb->s_num_meta_stolen, 0);
645 }
646
/*
 * Reset the stealing state for both allocator kinds: inode first, then
 * metadata.
 */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
652
653 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
654 {
655         spin_lock(&osb->osb_lock);
656         if (type == INODE_ALLOC_SYSTEM_INODE)
657                 osb->s_inode_steal_slot = slot;
658         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
659                 osb->s_meta_steal_slot = slot;
660         spin_unlock(&osb->osb_lock);
661 }
662
663 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
664 {
665         int slot = OCFS2_INVALID_SLOT;
666
667         spin_lock(&osb->osb_lock);
668         if (type == INODE_ALLOC_SYSTEM_INODE)
669                 slot = osb->s_inode_steal_slot;
670         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
671                 slot = osb->s_meta_steal_slot;
672         spin_unlock(&osb->osb_lock);
673
674         return slot;
675 }
676
677 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
678 {
679         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
680 }
681
682 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
683 {
684         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
685 }
686
687 static int ocfs2_steal_resource(struct ocfs2_super *osb,
688                                 struct ocfs2_alloc_context *ac,
689                                 int type)
690 {
691         int i, status = -ENOSPC;
692         int slot = __ocfs2_get_steal_slot(osb, type);
693
694         /* Start to steal resource from the first slot after ours. */
695         if (slot == OCFS2_INVALID_SLOT)
696                 slot = osb->slot_num + 1;
697
698         for (i = 0; i < osb->max_slots; i++, slot++) {
699                 if (slot == osb->max_slots)
700                         slot = 0;
701
702                 if (slot == osb->slot_num)
703                         continue;
704
705                 status = ocfs2_reserve_suballoc_bits(osb, ac,
706                                                      type,
707                                                      (u32)slot, NULL,
708                                                      NOT_ALLOC_NEW_GROUP);
709                 if (status >= 0) {
710                         __ocfs2_set_steal_slot(osb, slot, type);
711                         break;
712                 }
713
714                 ocfs2_free_ac_resource(ac);
715         }
716
717         return status;
718 }
719
720 static int ocfs2_steal_inode(struct ocfs2_super *osb,
721                              struct ocfs2_alloc_context *ac)
722 {
723         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
724 }
725
726 static int ocfs2_steal_meta(struct ocfs2_super *osb,
727                             struct ocfs2_alloc_context *ac)
728 {
729         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
730 }
731
732 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
733                                       int blocks,
734                                       struct ocfs2_alloc_context **ac)
735 {
736         int status;
737         int slot = ocfs2_get_meta_steal_slot(osb);
738
739         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
740         if (!(*ac)) {
741                 status = -ENOMEM;
742                 mlog_errno(status);
743                 goto bail;
744         }
745
746         (*ac)->ac_bits_wanted = blocks;
747         (*ac)->ac_which = OCFS2_AC_USE_META;
748         (*ac)->ac_group_search = ocfs2_block_group_search;
749
750         if (slot != OCFS2_INVALID_SLOT &&
751                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
752                 goto extent_steal;
753
754         atomic_set(&osb->s_num_meta_stolen, 0);
755         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
756                                              EXTENT_ALLOC_SYSTEM_INODE,
757                                              (u32)osb->slot_num, NULL,
758                                              ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
759
760
761         if (status >= 0) {
762                 status = 0;
763                 if (slot != OCFS2_INVALID_SLOT)
764                         ocfs2_init_meta_steal_slot(osb);
765                 goto bail;
766         } else if (status < 0 && status != -ENOSPC) {
767                 mlog_errno(status);
768                 goto bail;
769         }
770
771         ocfs2_free_ac_resource(*ac);
772
773 extent_steal:
774         status = ocfs2_steal_meta(osb, *ac);
775         atomic_inc(&osb->s_num_meta_stolen);
776         if (status < 0) {
777                 if (status != -ENOSPC)
778                         mlog_errno(status);
779                 goto bail;
780         }
781
782         status = 0;
783 bail:
784         if ((status < 0) && *ac) {
785                 ocfs2_free_alloc_context(*ac);
786                 *ac = NULL;
787         }
788
789         mlog_exit(status);
790         return status;
791 }
792
/*
 * Reserve the number of metadata blocks that ocfs2_extend_meta_needed()
 * computes for @root_el.  See ocfs2_reserve_new_metadata_blocks() for the
 * return value and the contents of @ac.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	int blocks_needed = ocfs2_extend_meta_needed(root_el);

	return ocfs2_reserve_new_metadata_blocks(osb, blocks_needed, ac);
}
801
802 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
803                             struct ocfs2_alloc_context **ac)
804 {
805         int status;
806         int slot = ocfs2_get_inode_steal_slot(osb);
807         u64 alloc_group;
808
809         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
810         if (!(*ac)) {
811                 status = -ENOMEM;
812                 mlog_errno(status);
813                 goto bail;
814         }
815
816         (*ac)->ac_bits_wanted = 1;
817         (*ac)->ac_which = OCFS2_AC_USE_INODE;
818
819         (*ac)->ac_group_search = ocfs2_block_group_search;
820
821         /*
822          * stat(2) can't handle i_ino > 32bits, so we tell the
823          * lower levels not to allocate us a block group past that
824          * limit.  The 'inode64' mount option avoids this behavior.
825          */
826         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
827                 (*ac)->ac_max_block = (u32)~0U;
828
829         /*
830          * slot is set when we successfully steal inode from other nodes.
831          * It is reset in 3 places:
832          * 1. when we flush the truncate log
833          * 2. when we complete local alloc recovery.
834          * 3. when we successfully allocate from our own slot.
835          * After it is set, we will go on stealing inodes until we find the
836          * need to check our slots to see whether there is some space for us.
837          */
838         if (slot != OCFS2_INVALID_SLOT &&
839             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
840                 goto inode_steal;
841
842         atomic_set(&osb->s_num_inodes_stolen, 0);
843         alloc_group = osb->osb_inode_alloc_group;
844         status = ocfs2_reserve_suballoc_bits(osb, *ac,
845                                              INODE_ALLOC_SYSTEM_INODE,
846                                              (u32)osb->slot_num,
847                                              &alloc_group,
848                                              ALLOC_NEW_GROUP |
849                                              ALLOC_GROUPS_FROM_GLOBAL);
850         if (status >= 0) {
851                 status = 0;
852
853                 spin_lock(&osb->osb_lock);
854                 osb->osb_inode_alloc_group = alloc_group;
855                 spin_unlock(&osb->osb_lock);
856                 mlog(0, "after reservation, new allocation group is "
857                      "%llu\n", (unsigned long long)alloc_group);
858
859                 /*
860                  * Some inodes must be freed by us, so try to allocate
861                  * from our own next time.
862                  */
863                 if (slot != OCFS2_INVALID_SLOT)
864                         ocfs2_init_inode_steal_slot(osb);
865                 goto bail;
866         } else if (status < 0 && status != -ENOSPC) {
867                 mlog_errno(status);
868                 goto bail;
869         }
870
871         ocfs2_free_ac_resource(*ac);
872
873 inode_steal:
874         status = ocfs2_steal_inode(osb, *ac);
875         atomic_inc(&osb->s_num_inodes_stolen);
876         if (status < 0) {
877                 if (status != -ENOSPC)
878                         mlog_errno(status);
879                 goto bail;
880         }
881
882         status = 0;
883 bail:
884         if ((status < 0) && *ac) {
885                 ocfs2_free_alloc_context(*ac);
886                 *ac = NULL;
887         }
888
889         mlog_exit(status);
890         return status;
891 }
892
893 /* local alloc code has to do the same thing, so rather than do this
894  * twice.. */
895 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
896                                       struct ocfs2_alloc_context *ac)
897 {
898         int status;
899
900         ac->ac_which = OCFS2_AC_USE_MAIN;
901         ac->ac_group_search = ocfs2_cluster_group_search;
902
903         status = ocfs2_reserve_suballoc_bits(osb, ac,
904                                              GLOBAL_BITMAP_SYSTEM_INODE,
905                                              OCFS2_INVALID_SLOT, NULL,
906                                              ALLOC_NEW_GROUP);
907         if (status < 0 && status != -ENOSPC) {
908                 mlog_errno(status);
909                 goto bail;
910         }
911
912 bail:
913         return status;
914 }
915
916 /* Callers don't need to care which bitmap (local alloc or main) to
917  * use so we figure it out for them, but unfortunately this clutters
918  * things a bit. */
919 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
920                                              u32 bits_wanted, u64 max_block,
921                                              int flags,
922                                              struct ocfs2_alloc_context **ac)
923 {
924         int status;
925
926         mlog_entry_void();
927
928         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
929         if (!(*ac)) {
930                 status = -ENOMEM;
931                 mlog_errno(status);
932                 goto bail;
933         }
934
935         (*ac)->ac_bits_wanted = bits_wanted;
936         (*ac)->ac_max_block = max_block;
937
938         status = -ENOSPC;
939         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
940             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
941                 status = ocfs2_reserve_local_alloc_bits(osb,
942                                                         bits_wanted,
943                                                         *ac);
944                 if ((status < 0) && (status != -ENOSPC)) {
945                         mlog_errno(status);
946                         goto bail;
947                 }
948         }
949
950         if (status == -ENOSPC) {
951                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
952                 if (status < 0) {
953                         if (status != -ENOSPC)
954                                 mlog_errno(status);
955                         goto bail;
956                 }
957         }
958
959         status = 0;
960 bail:
961         if ((status < 0) && *ac) {
962                 ocfs2_free_alloc_context(*ac);
963                 *ac = NULL;
964         }
965
966         mlog_exit(status);
967         return status;
968 }
969
970 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
971                            u32 bits_wanted,
972                            struct ocfs2_alloc_context **ac)
973 {
974         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
975                                                  ALLOC_NEW_GROUP, ac);
976 }
977
978 /*
979  * More or less lifted from ext3. I'll leave their description below:
980  *
981  * "For ext3 allocations, we must not reuse any blocks which are
982  * allocated in the bitmap buffer's "last committed data" copy.  This
983  * prevents deletes from freeing up the page for reuse until we have
984  * committed the delete transaction.
985  *
986  * If we didn't do this, then deleting something and reallocating it as
987  * data would allow the old block to be overwritten before the
988  * transaction committed (because we force data to disk before commit).
989  * This would lead to corruption if we crashed between overwriting the
990  * data and committing the delete.
991  *
992  * @@@ We may want to make this allocation behaviour conditional on
993  * data-writes at some point, and disable it for metadata allocations or
994  * sync-data inodes."
995  *
996  * Note: OCFS2 already does this differently for metadata vs data
997  * allocations, as those bitmaps are separate and undo access is never
998  * called on a metadata group descriptor.
999  */
1000 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1001                                          int nr)
1002 {
1003         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1004         int ret;
1005
1006         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1007                 return 0;
1008
1009         if (!buffer_jbd(bg_bh))
1010                 return 1;
1011
1012         jbd_lock_bh_state(bg_bh);
1013         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1014         if (bg)
1015                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1016         else
1017                 ret = 1;
1018         jbd_unlock_bh_state(bg_bh);
1019
1020         return ret;
1021 }
1022
1023 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1024                                              struct buffer_head *bg_bh,
1025                                              unsigned int bits_wanted,
1026                                              unsigned int total_bits,
1027                                              u16 *bit_off,
1028                                              u16 *bits_found)
1029 {
1030         void *bitmap;
1031         u16 best_offset, best_size;
1032         int offset, start, found, status = 0;
1033         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1034
1035         /* Callers got this descriptor from
1036          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1037         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1038
1039         found = start = best_offset = best_size = 0;
1040         bitmap = bg->bg_bitmap;
1041
1042         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1043                 if (offset == total_bits)
1044                         break;
1045
1046                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1047                         /* We found a zero, but we can't use it as it
1048                          * hasn't been put to disk yet! */
1049                         found = 0;
1050                         start = offset + 1;
1051                 } else if (offset == start) {
1052                         /* we found a zero */
1053                         found++;
1054                         /* move start to the next bit to test */
1055                         start++;
1056                 } else {
1057                         /* got a zero after some ones */
1058                         found = 1;
1059                         start = offset + 1;
1060                 }
1061                 if (found > best_size) {
1062                         best_size = found;
1063                         best_offset = start - found;
1064                 }
1065                 /* we got everything we needed */
1066                 if (found == bits_wanted) {
1067                         /* mlog(0, "Found it all!\n"); */
1068                         break;
1069                 }
1070         }
1071
1072         /* XXX: I think the first clause is equivalent to the second
1073          *      - jlbec */
1074         if (found == bits_wanted) {
1075                 *bit_off = start - found;
1076                 *bits_found = found;
1077         } else if (best_size) {
1078                 *bit_off = best_offset;
1079                 *bits_found = best_size;
1080         } else {
1081                 status = -ENOSPC;
1082                 /* No error log here -- see the comment above
1083                  * ocfs2_test_bg_bit_allocatable */
1084         }
1085
1086         return status;
1087 }
1088
1089 static inline int ocfs2_block_group_set_bits(handle_t *handle,
1090                                              struct inode *alloc_inode,
1091                                              struct ocfs2_group_desc *bg,
1092                                              struct buffer_head *group_bh,
1093                                              unsigned int bit_off,
1094                                              unsigned int num_bits)
1095 {
1096         int status;
1097         void *bitmap = bg->bg_bitmap;
1098         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1099
1100         mlog_entry_void();
1101
1102         /* All callers get the descriptor via
1103          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1104         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1105         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1106
1107         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1108              num_bits);
1109
1110         if (ocfs2_is_cluster_bitmap(alloc_inode))
1111                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1112
1113         status = ocfs2_journal_access_gd(handle,
1114                                          INODE_CACHE(alloc_inode),
1115                                          group_bh,
1116                                          journal_type);
1117         if (status < 0) {
1118                 mlog_errno(status);
1119                 goto bail;
1120         }
1121
1122         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1123         while(num_bits--)
1124                 ocfs2_set_bit(bit_off++, bitmap);
1125
1126         ocfs2_journal_dirty(handle, group_bh);
1127
1128 bail:
1129         mlog_exit(status);
1130         return status;
1131 }
1132
1133 /* find the one with the most empty bits */
1134 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1135 {
1136         u16 curr, best;
1137
1138         BUG_ON(!cl->cl_next_free_rec);
1139
1140         best = curr = 0;
1141         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1142                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1143                     le32_to_cpu(cl->cl_recs[best].c_free))
1144                         best = curr;
1145                 curr++;
1146         }
1147
1148         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1149         return best;
1150 }
1151
1152 static int ocfs2_relink_block_group(handle_t *handle,
1153                                     struct inode *alloc_inode,
1154                                     struct buffer_head *fe_bh,
1155                                     struct buffer_head *bg_bh,
1156                                     struct buffer_head *prev_bg_bh,
1157                                     u16 chain)
1158 {
1159         int status;
1160         /* there is a really tiny chance the journal calls could fail,
1161          * but we wouldn't want inconsistent blocks in *any* case. */
1162         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1163         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1164         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1165         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1166
1167         /* The caller got these descriptors from
1168          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1169         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1170         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1171
1172         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1173              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1174              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1175              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1176
1177         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1178         bg_ptr = le64_to_cpu(bg->bg_next_group);
1179         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1180
1181         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1182                                          prev_bg_bh,
1183                                          OCFS2_JOURNAL_ACCESS_WRITE);
1184         if (status < 0) {
1185                 mlog_errno(status);
1186                 goto out_rollback;
1187         }
1188
1189         prev_bg->bg_next_group = bg->bg_next_group;
1190         ocfs2_journal_dirty(handle, prev_bg_bh);
1191
1192         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1193                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1194         if (status < 0) {
1195                 mlog_errno(status);
1196                 goto out_rollback;
1197         }
1198
1199         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1200         ocfs2_journal_dirty(handle, bg_bh);
1201
1202         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1203                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1204         if (status < 0) {
1205                 mlog_errno(status);
1206                 goto out_rollback;
1207         }
1208
1209         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1210         ocfs2_journal_dirty(handle, fe_bh);
1211
1212 out_rollback:
1213         if (status < 0) {
1214                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1215                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1216                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1217         }
1218
1219         mlog_exit(status);
1220         return status;
1221 }
1222
1223 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1224                                                      u32 wanted)
1225 {
1226         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1227 }
1228
1229 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1230  * value on error. */
1231 static int ocfs2_cluster_group_search(struct inode *inode,
1232                                       struct buffer_head *group_bh,
1233                                       u32 bits_wanted, u32 min_bits,
1234                                       u64 max_block,
1235                                       u16 *bit_off, u16 *bits_found)
1236 {
1237         int search = -ENOSPC;
1238         int ret;
1239         u64 blkoff;
1240         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1241         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1242         u16 tmp_off, tmp_found;
1243         unsigned int max_bits, gd_cluster_off;
1244
1245         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1246
1247         if (gd->bg_free_bits_count) {
1248                 max_bits = le16_to_cpu(gd->bg_bits);
1249
1250                 /* Tail groups in cluster bitmaps which aren't cpg
1251                  * aligned are prone to partial extention by a failed
1252                  * fs resize. If the file system resize never got to
1253                  * update the dinode cluster count, then we don't want
1254                  * to trust any clusters past it, regardless of what
1255                  * the group descriptor says. */
1256                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1257                                                           le64_to_cpu(gd->bg_blkno));
1258                 if ((gd_cluster_off + max_bits) >
1259                     OCFS2_I(inode)->ip_clusters) {
1260                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1261                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1262                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1263                              le16_to_cpu(gd->bg_bits),
1264                              OCFS2_I(inode)->ip_clusters, max_bits);
1265                 }
1266
1267                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1268                                                         group_bh, bits_wanted,
1269                                                         max_bits,
1270                                                         &tmp_off, &tmp_found);
1271                 if (ret)
1272                         return ret;
1273
1274                 if (max_block) {
1275                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1276                                                           gd_cluster_off +
1277                                                           tmp_off + tmp_found);
1278                         mlog(0, "Checking %llu against %llu\n",
1279                              (unsigned long long)blkoff,
1280                              (unsigned long long)max_block);
1281                         if (blkoff > max_block)
1282                                 return -ENOSPC;
1283                 }
1284
1285                 /* ocfs2_block_group_find_clear_bits() might
1286                  * return success, but we still want to return
1287                  * -ENOSPC unless it found the minimum number
1288                  * of bits. */
1289                 if (min_bits <= tmp_found) {
1290                         *bit_off = tmp_off;
1291                         *bits_found = tmp_found;
1292                         search = 0; /* success */
1293                 } else if (tmp_found) {
1294                         /*
1295                          * Don't show bits which we'll be returning
1296                          * for allocation to the local alloc bitmap.
1297                          */
1298                         ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1299                 }
1300         }
1301
1302         return search;
1303 }
1304
1305 static int ocfs2_block_group_search(struct inode *inode,
1306                                     struct buffer_head *group_bh,
1307                                     u32 bits_wanted, u32 min_bits,
1308                                     u64 max_block,
1309                                     u16 *bit_off, u16 *bits_found)
1310 {
1311         int ret = -ENOSPC;
1312         u64 blkoff;
1313         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1314
1315         BUG_ON(min_bits != 1);
1316         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1317
1318         if (bg->bg_free_bits_count) {
1319                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1320                                                         group_bh, bits_wanted,
1321                                                         le16_to_cpu(bg->bg_bits),
1322                                                         bit_off, bits_found);
1323                 if (!ret && max_block) {
1324                         blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1325                                 *bits_found;
1326                         mlog(0, "Checking %llu against %llu\n",
1327                              (unsigned long long)blkoff,
1328                              (unsigned long long)max_block);
1329                         if (blkoff > max_block)
1330                                 ret = -ENOSPC;
1331                 }
1332         }
1333
1334         return ret;
1335 }
1336
1337 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1338                                        handle_t *handle,
1339                                        struct buffer_head *di_bh,
1340                                        u32 num_bits,
1341                                        u16 chain)
1342 {
1343         int ret;
1344         u32 tmp_used;
1345         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1346         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1347
1348         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1349                                       OCFS2_JOURNAL_ACCESS_WRITE);
1350         if (ret < 0) {
1351                 mlog_errno(ret);
1352                 goto out;
1353         }
1354
1355         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1356         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1357         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1358         ocfs2_journal_dirty(handle, di_bh);
1359
1360 out:
1361         return ret;
1362 }
1363
1364 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1365                                   handle_t *handle,
1366                                   u32 bits_wanted,
1367                                   u32 min_bits,
1368                                   u16 *bit_off,
1369                                   unsigned int *num_bits,
1370                                   u64 gd_blkno,
1371                                   u16 *bits_left)
1372 {
1373         int ret;
1374         u16 found;
1375         struct buffer_head *group_bh = NULL;
1376         struct ocfs2_group_desc *gd;
1377         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1378         struct inode *alloc_inode = ac->ac_inode;
1379
1380         ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1381                                           &group_bh);
1382         if (ret < 0) {
1383                 mlog_errno(ret);
1384                 return ret;
1385         }
1386
1387         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1388         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1389                                   ac->ac_max_block, bit_off, &found);
1390         if (ret < 0) {
1391                 if (ret != -ENOSPC)
1392                         mlog_errno(ret);
1393                 goto out;
1394         }
1395
1396         *num_bits = found;
1397
1398         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1399                                                *num_bits,
1400                                                le16_to_cpu(gd->bg_chain));
1401         if (ret < 0) {
1402                 mlog_errno(ret);
1403                 goto out;
1404         }
1405
1406         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1407                                          *bit_off, *num_bits);
1408         if (ret < 0)
1409                 mlog_errno(ret);
1410
1411         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1412
1413 out:
1414         brelse(group_bh);
1415
1416         return ret;
1417 }
1418
1419 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1420                               handle_t *handle,
1421                               u32 bits_wanted,
1422                               u32 min_bits,
1423                               u16 *bit_off,
1424                               unsigned int *num_bits,
1425                               u64 *bg_blkno,
1426                               u16 *bits_left)
1427 {
1428         int status;
1429         u16 chain, tmp_bits;
1430         u32 tmp_used;
1431         u64 next_group;
1432         struct inode *alloc_inode = ac->ac_inode;
1433         struct buffer_head *group_bh = NULL;
1434         struct buffer_head *prev_group_bh = NULL;
1435         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1436         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1437         struct ocfs2_group_desc *bg;
1438
1439         chain = ac->ac_chain;
1440         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1441              bits_wanted, chain,
1442              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1443
1444         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1445                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1446                                              &group_bh);
1447         if (status < 0) {
1448                 mlog_errno(status);
1449                 goto bail;
1450         }
1451         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1452
1453         status = -ENOSPC;
1454         /* for now, the chain search is a bit simplistic. We just use
1455          * the 1st group with any empty bits. */
1456         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1457                                              bits_wanted, min_bits,
1458                                              ac->ac_max_block, bit_off,
1459                                              &tmp_bits)) == -ENOSPC) {
1460                 if (!bg->bg_next_group)
1461                         break;
1462
1463                 brelse(prev_group_bh);
1464                 prev_group_bh = NULL;
1465
1466                 next_group = le64_to_cpu(bg->bg_next_group);
1467                 prev_group_bh = group_bh;
1468                 group_bh = NULL;
1469                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1470                                                      next_group, &group_bh);
1471                 if (status < 0) {
1472                         mlog_errno(status);
1473                         goto bail;
1474                 }
1475                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1476         }
1477         if (status < 0) {
1478                 if (status != -ENOSPC)
1479                         mlog_errno(status);
1480                 goto bail;
1481         }
1482
1483         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1484              tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1485
1486         *num_bits = tmp_bits;
1487
1488         BUG_ON(*num_bits == 0);
1489
1490         /*
1491          * Keep track of previous block descriptor read. When
1492          * we find a target, if we have read more than X
1493          * number of descriptors, and the target is reasonably
1494          * empty, relink him to top of his chain.
1495          *
1496          * We've read 0 extra blocks and only send one more to
1497          * the transaction, yet the next guy to search has a
1498          * much easier time.
1499          *
1500          * Do this *after* figuring out how many bits we're taking out
1501          * of our target group.
1502          */
1503         if (ac->ac_allow_chain_relink &&
1504             (prev_group_bh) &&
1505             (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1506                 status = ocfs2_relink_block_group(handle, alloc_inode,
1507                                                   ac->ac_bh, group_bh,
1508                                                   prev_group_bh, chain);
1509                 if (status < 0) {
1510                         mlog_errno(status);
1511                         goto bail;
1512                 }
1513         }
1514
1515         /* Ok, claim our bits now: set the info on dinode, chainlist
1516          * and then the group */
1517         status = ocfs2_journal_access_di(handle,
1518                                          INODE_CACHE(alloc_inode),
1519                                          ac->ac_bh,
1520                                          OCFS2_JOURNAL_ACCESS_WRITE);
1521         if (status < 0) {
1522                 mlog_errno(status);
1523                 goto bail;
1524         }
1525
1526         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1527         fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1528         le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1529         ocfs2_journal_dirty(handle, ac->ac_bh);
1530
1531         status = ocfs2_block_group_set_bits(handle,
1532                                             alloc_inode,
1533                                             bg,
1534                                             group_bh,
1535                                             *bit_off,
1536                                             *num_bits);
1537         if (status < 0) {
1538                 mlog_errno(status);
1539                 goto bail;
1540         }
1541
1542         mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1543              (unsigned long long)le64_to_cpu(fe->i_blkno));
1544
1545         *bg_blkno = le64_to_cpu(bg->bg_blkno);
1546         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1547 bail:
1548         brelse(group_bh);
1549         brelse(prev_group_bh);
1550
1551         mlog_exit(status);
1552         return status;
1553 }
1554
1555 /* will give out up to bits_wanted contiguous bits. */
1556 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1557                                      struct ocfs2_alloc_context *ac,
1558                                      handle_t *handle,
1559                                      u32 bits_wanted,
1560                                      u32 min_bits,
1561                                      u16 *bit_off,
1562                                      unsigned int *num_bits,
1563                                      u64 *bg_blkno)
1564 {
1565         int status;
1566         u16 victim, i;
1567         u16 bits_left = 0;
1568         u64 hint_blkno = ac->ac_last_group;
1569         struct ocfs2_chain_list *cl;
1570         struct ocfs2_dinode *fe;
1571
1572         mlog_entry_void();
1573
1574         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1575         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1576         BUG_ON(!ac->ac_bh);
1577
1578         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1579
1580         /* The bh was validated by the inode read during
1581          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1582         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1583
1584         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1585             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1586                 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1587                             "bits but only %u total.",
1588                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1589                             le32_to_cpu(fe->id1.bitmap1.i_used),
1590                             le32_to_cpu(fe->id1.bitmap1.i_total));
1591                 status = -EIO;
1592                 goto bail;
1593         }
1594
1595         if (hint_blkno) {
1596                 /* Attempt to short-circuit the usual search mechanism
1597                  * by jumping straight to the most recently used
1598                  * allocation group. This helps us mantain some
1599                  * contiguousness across allocations. */
1600                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1601                                                 min_bits, bit_off, num_bits,
1602                                                 hint_blkno, &bits_left);
1603                 if (!status) {
1604                         /* Be careful to update *bg_blkno here as the
1605                          * caller is expecting it to be filled in, and
1606                          * ocfs2_search_one_group() won't do that for
1607                          * us. */
1608                         *bg_blkno = hint_blkno;
1609                         goto set_hint;
1610                 }
1611                 if (status < 0 && status != -ENOSPC) {
1612                         mlog_errno(status);
1613                         goto bail;
1614                 }
1615         }
1616
1617         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1618
1619         victim = ocfs2_find_victim_chain(cl);
1620         ac->ac_chain = victim;
1621         ac->ac_allow_chain_relink = 1;
1622
1623         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1624                                     num_bits, bg_blkno, &bits_left);
1625         if (!status)
1626                 goto set_hint;
1627         if (status < 0 && status != -ENOSPC) {
1628                 mlog_errno(status);
1629                 goto bail;
1630         }
1631
1632         mlog(0, "Search of victim chain %u came up with nothing, "
1633              "trying all chains now.\n", victim);
1634
1635         /* If we didn't pick a good victim, then just default to
1636          * searching each chain in order. Don't allow chain relinking
1637          * because we only calculate enough journal credits for one
1638          * relink per alloc. */
1639         ac->ac_allow_chain_relink = 0;
1640         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1641                 if (i == victim)
1642                         continue;
1643                 if (!cl->cl_recs[i].c_free)
1644                         continue;
1645
1646                 ac->ac_chain = i;
1647                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1648                                             bit_off, num_bits, bg_blkno,
1649                                             &bits_left);
1650                 if (!status)
1651                         break;
1652                 if (status < 0 && status != -ENOSPC) {
1653                         mlog_errno(status);
1654                         goto bail;
1655                 }
1656         }
1657
1658 set_hint:
1659         if (status != -ENOSPC) {
1660                 /* If the next search of this group is not likely to
1661                  * yield a suitable extent, then we reset the last
1662                  * group hint so as to not waste a disk read */
1663                 if (bits_left < min_bits)
1664                         ac->ac_last_group = 0;
1665                 else
1666                         ac->ac_last_group = *bg_blkno;
1667         }
1668
1669 bail:
1670         mlog_exit(status);
1671         return status;
1672 }
1673
1674 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1675                          handle_t *handle,
1676                          struct ocfs2_alloc_context *ac,
1677                          u32 bits_wanted,
1678                          u16 *suballoc_bit_start,
1679                          unsigned int *num_bits,
1680                          u64 *blkno_start)
1681 {
1682         int status;
1683         u64 bg_blkno;
1684
1685         BUG_ON(!ac);
1686         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1687         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1688
1689         status = ocfs2_claim_suballoc_bits(osb,
1690                                            ac,
1691                                            handle,
1692                                            bits_wanted,
1693                                            1,
1694                                            suballoc_bit_start,
1695                                            num_bits,
1696                                            &bg_blkno);
1697         if (status < 0) {
1698                 mlog_errno(status);
1699                 goto bail;
1700         }
1701         atomic_inc(&osb->alloc_stats.bg_allocs);
1702
1703         *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1704         ac->ac_bits_given += (*num_bits);
1705         status = 0;
1706 bail:
1707         mlog_exit(status);
1708         return status;
1709 }
1710
1711 static void ocfs2_init_inode_ac_group(struct inode *dir,
1712                                       struct buffer_head *parent_fe_bh,
1713                                       struct ocfs2_alloc_context *ac)
1714 {
1715         struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1716         /*
1717          * Try to allocate inodes from some specific group.
1718          *
1719          * If the parent dir has recorded the last group used in allocation,
1720          * cool, use it. Otherwise if we try to allocate new inode from the
1721          * same slot the parent dir belongs to, use the same chunk.
1722          *
1723          * We are very careful here to avoid the mistake of setting
1724          * ac_last_group to a group descriptor from a different (unlocked) slot.
1725          */
1726         if (OCFS2_I(dir)->ip_last_used_group &&
1727             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1728                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1729         else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1730                 ac->ac_last_group = ocfs2_which_suballoc_group(
1731                                         le64_to_cpu(fe->i_blkno),
1732                                         le16_to_cpu(fe->i_suballoc_bit));
1733 }
1734
1735 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1736                                              struct ocfs2_alloc_context *ac)
1737 {
1738         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1739         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1740 }
1741
1742 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1743                           handle_t *handle,
1744                           struct inode *dir,
1745                           struct buffer_head *parent_fe_bh,
1746                           struct ocfs2_alloc_context *ac,
1747                           u16 *suballoc_bit,
1748                           u64 *fe_blkno)
1749 {
1750         int status;
1751         unsigned int num_bits;
1752         u64 bg_blkno;
1753
1754         mlog_entry_void();
1755
1756         BUG_ON(!ac);
1757         BUG_ON(ac->ac_bits_given != 0);
1758         BUG_ON(ac->ac_bits_wanted != 1);
1759         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1760
1761         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1762
1763         status = ocfs2_claim_suballoc_bits(osb,
1764                                            ac,
1765                                            handle,
1766                                            1,
1767                                            1,
1768                                            suballoc_bit,
1769                                            &num_bits,
1770                                            &bg_blkno);
1771         if (status < 0) {
1772                 mlog_errno(status);
1773                 goto bail;
1774         }
1775         atomic_inc(&osb->alloc_stats.bg_allocs);
1776
1777         BUG_ON(num_bits != 1);
1778
1779         *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1780         ac->ac_bits_given++;
1781         ocfs2_save_inode_ac_group(dir, ac);
1782         status = 0;
1783 bail:
1784         mlog_exit(status);
1785         return status;
1786 }
1787
1788 /* translate a group desc. blkno and it's bitmap offset into
1789  * disk cluster offset. */
1790 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1791                                                    u64 bg_blkno,
1792                                                    u16 bg_bit_off)
1793 {
1794         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1795         u32 cluster = 0;
1796
1797         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1798
1799         if (bg_blkno != osb->first_cluster_group_blkno)
1800                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1801         cluster += (u32) bg_bit_off;
1802         return cluster;
1803 }
1804
1805 /* given a cluster offset, calculate which block group it belongs to
1806  * and return that block offset. */
1807 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1808 {
1809         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1810         u32 group_no;
1811
1812         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1813
1814         group_no = cluster / osb->bitmap_cpg;
1815         if (!group_no)
1816                 return osb->first_cluster_group_blkno;
1817         return ocfs2_clusters_to_blocks(inode->i_sb,
1818                                         group_no * osb->bitmap_cpg);
1819 }
1820
1821 /* given the block number of a cluster start, calculate which cluster
1822  * group and descriptor bitmap offset that corresponds to. */
1823 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1824                                                 u64 data_blkno,
1825                                                 u64 *bg_blkno,
1826                                                 u16 *bg_bit_off)
1827 {
1828         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1829         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1830
1831         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1832
1833         *bg_blkno = ocfs2_which_cluster_group(inode,
1834                                               data_cluster);
1835
1836         if (*bg_blkno == osb->first_cluster_group_blkno)
1837                 *bg_bit_off = (u16) data_cluster;
1838         else
1839                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1840                                                              data_blkno - *bg_blkno);
1841 }
1842
1843 /*
1844  * min_bits - minimum contiguous chunk from this total allocation we
1845  * can handle. set to what we asked for originally for a full
1846  * contig. allocation, set to '1' to indicate we can deal with extents
1847  * of any size.
1848  */
1849 int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1850                            handle_t *handle,
1851                            struct ocfs2_alloc_context *ac,
1852                            u32 min_clusters,
1853                            u32 max_clusters,
1854                            u32 *cluster_start,
1855                            u32 *num_clusters)
1856 {
1857         int status;
1858         unsigned int bits_wanted = max_clusters;
1859         u64 bg_blkno = 0;
1860         u16 bg_bit_off;
1861
1862         mlog_entry_void();
1863
1864         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1865
1866         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1867                && ac->ac_which != OCFS2_AC_USE_MAIN);
1868
1869         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1870                 WARN_ON(min_clusters > 1);
1871
1872                 status = ocfs2_claim_local_alloc_bits(osb,
1873                                                       handle,
1874                                                       ac,
1875                                                       bits_wanted,
1876                                                       cluster_start,
1877                                                       num_clusters);
1878                 if (!status)
1879                         atomic_inc(&osb->alloc_stats.local_data);
1880         } else {
1881                 if (min_clusters > (osb->bitmap_cpg - 1)) {
1882                         /* The only paths asking for contiguousness
1883                          * should know about this already. */
1884                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1885                              "group bitmap size %u!\n", min_clusters,
1886                              osb->bitmap_cpg);
1887                         status = -ENOSPC;
1888                         goto bail;
1889                 }
1890                 /* clamp the current request down to a realistic size. */
1891                 if (bits_wanted > (osb->bitmap_cpg - 1))
1892                         bits_wanted = osb->bitmap_cpg - 1;
1893
1894                 status = ocfs2_claim_suballoc_bits(osb,
1895                                                    ac,
1896                                                    handle,
1897                                                    bits_wanted,
1898                                                    min_clusters,
1899                                                    &bg_bit_off,
1900                                                    num_clusters,
1901                                                    &bg_blkno);
1902                 if (!status) {
1903                         *cluster_start =
1904                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1905                                                                  bg_blkno,
1906                                                                  bg_bit_off);
1907                         atomic_inc(&osb->alloc_stats.bitmap_data);
1908                 }
1909         }
1910         if (status < 0) {
1911                 if (status != -ENOSPC)
1912                         mlog_errno(status);
1913                 goto bail;
1914         }
1915
1916         ac->ac_bits_given += *num_clusters;
1917
1918 bail:
1919         mlog_exit(status);
1920         return status;
1921 }
1922
1923 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1924                          handle_t *handle,
1925                          struct ocfs2_alloc_context *ac,
1926                          u32 min_clusters,
1927                          u32 *cluster_start,
1928                          u32 *num_clusters)
1929 {
1930         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1931
1932         return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1933                                       bits_wanted, cluster_start, num_clusters);
1934 }
1935
1936 static int ocfs2_block_group_clear_bits(handle_t *handle,
1937                                         struct inode *alloc_inode,
1938                                         struct ocfs2_group_desc *bg,
1939                                         struct buffer_head *group_bh,
1940                                         unsigned int bit_off,
1941                                         unsigned int num_bits,
1942                                         void (*undo_fn)(unsigned int bit,
1943                                                         unsigned long *bmap))
1944 {
1945         int status;
1946         unsigned int tmp;
1947         struct ocfs2_group_desc *undo_bg = NULL;
1948
1949         mlog_entry_void();
1950
1951         /* The caller got this descriptor from
1952          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1953         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1954
1955         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1956
1957         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1958         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1959                                          group_bh,
1960                                          undo_fn ?
1961                                          OCFS2_JOURNAL_ACCESS_UNDO :
1962                                          OCFS2_JOURNAL_ACCESS_WRITE);
1963         if (status < 0) {
1964                 mlog_errno(status);
1965                 goto bail;
1966         }
1967
1968         if (undo_fn) {
1969                 jbd_lock_bh_state(group_bh);
1970                 undo_bg = (struct ocfs2_group_desc *)
1971                                         bh2jh(group_bh)->b_committed_data;
1972                 BUG_ON(!undo_bg);
1973         }
1974
1975         tmp = num_bits;
1976         while(tmp--) {
1977                 ocfs2_clear_bit((bit_off + tmp),
1978                                 (unsigned long *) bg->bg_bitmap);
1979                 if (undo_fn)
1980                         undo_fn(bit_off + tmp,
1981                                 (unsigned long *) undo_bg->bg_bitmap);
1982         }
1983         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1984
1985         if (undo_fn)
1986                 jbd_unlock_bh_state(group_bh);
1987
1988         ocfs2_journal_dirty(handle, group_bh);
1989 bail:
1990         return status;
1991 }
1992
1993 /*
1994  * expects the suballoc inode to already be locked.
1995  */
1996 static int _ocfs2_free_suballoc_bits(handle_t *handle,
1997                                      struct inode *alloc_inode,
1998                                      struct buffer_head *alloc_bh,
1999                                      unsigned int start_bit,
2000                                      u64 bg_blkno,
2001                                      unsigned int count,
2002                                      void (*undo_fn)(unsigned int bit,
2003                                                      unsigned long *bitmap))
2004 {
2005         int status = 0;
2006         u32 tmp_used;
2007         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2008         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2009         struct buffer_head *group_bh = NULL;
2010         struct ocfs2_group_desc *group;
2011
2012         mlog_entry_void();
2013
2014         /* The alloc_bh comes from ocfs2_free_dinode() or
2015          * ocfs2_free_clusters().  The callers have all locked the
2016          * allocator and gotten alloc_bh from the lock call.  This
2017          * validates the dinode buffer.  Any corruption that has happended
2018          * is a code bug. */
2019         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2020         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2021
2022         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2023              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
2024              (unsigned long long)bg_blkno, start_bit);
2025
2026         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2027                                              &group_bh);
2028         if (status < 0) {
2029                 mlog_errno(status);
2030                 goto bail;
2031         }
2032         group = (struct ocfs2_group_desc *) group_bh->b_data;
2033
2034         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2035
2036         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2037                                               group, group_bh,
2038                                               start_bit, count, undo_fn);
2039         if (status < 0) {
2040                 mlog_errno(status);
2041                 goto bail;
2042         }
2043
2044         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2045                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2046         if (status < 0) {
2047                 mlog_errno(status);
2048                 goto bail;
2049         }
2050
2051         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2052                      count);
2053         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2054         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2055         ocfs2_journal_dirty(handle, alloc_bh);
2056
2057 bail:
2058         brelse(group_bh);
2059
2060         mlog_exit(status);
2061         return status;
2062 }
2063
2064 int ocfs2_free_suballoc_bits(handle_t *handle,
2065                              struct inode *alloc_inode,
2066                              struct buffer_head *alloc_bh,
2067                              unsigned int start_bit,
2068                              u64 bg_blkno,
2069                              unsigned int count)
2070 {
2071         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2072                                          start_bit, bg_blkno, count, NULL);
2073 }
2074
2075 int ocfs2_free_dinode(handle_t *handle,
2076                       struct inode *inode_alloc_inode,
2077                       struct buffer_head *inode_alloc_bh,
2078                       struct ocfs2_dinode *di)
2079 {
2080         u64 blk = le64_to_cpu(di->i_blkno);
2081         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2082         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2083
2084         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2085                                         inode_alloc_bh, bit, bg_blkno, 1);
2086 }
2087
2088 static int _ocfs2_free_clusters(handle_t *handle,
2089                                 struct inode *bitmap_inode,
2090                                 struct buffer_head *bitmap_bh,
2091                                 u64 start_blk,
2092                                 unsigned int num_clusters,
2093                                 void (*undo_fn)(unsigned int bit,
2094                                                 unsigned long *bitmap))
2095 {
2096         int status;
2097         u16 bg_start_bit;
2098         u64 bg_blkno;
2099         struct ocfs2_dinode *fe;
2100
2101         /* You can't ever have a contiguous set of clusters
2102          * bigger than a block group bitmap so we never have to worry
2103          * about looping on them. */
2104
2105         mlog_entry_void();
2106
2107         /* This is expensive. We can safely remove once this stuff has
2108          * gotten tested really well. */
2109         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2110
2111         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2112
2113         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2114                                      &bg_start_bit);
2115
2116         mlog(0, "want to free %u clusters starting at block %llu\n",
2117              num_clusters, (unsigned long long)start_blk);
2118         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2119              (unsigned long long)bg_blkno, bg_start_bit);
2120
2121         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2122                                            bg_start_bit, bg_blkno,
2123                                            num_clusters, undo_fn);
2124         if (status < 0) {
2125                 mlog_errno(status);
2126                 goto out;
2127         }
2128
2129         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2130                                          num_clusters);
2131
2132 out:
2133         mlog_exit(status);
2134         return status;
2135 }
2136
2137 int ocfs2_free_clusters(handle_t *handle,
2138                         struct inode *bitmap_inode,
2139                         struct buffer_head *bitmap_bh,
2140                         u64 start_blk,
2141                         unsigned int num_clusters)
2142 {
2143         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2144                                     start_blk, num_clusters,
2145                                     _ocfs2_set_bit);
2146 }
2147
2148 /*
2149  * Give never-used clusters back to the global bitmap.  We don't need
2150  * to protect these bits in the undo buffer.
2151  */
2152 int ocfs2_release_clusters(handle_t *handle,
2153                            struct inode *bitmap_inode,
2154                            struct buffer_head *bitmap_bh,
2155                            u64 start_blk,
2156                            unsigned int num_clusters)
2157 {
2158         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2159                                     start_blk, num_clusters,
2160                                     _ocfs2_clear_bit);
2161 }
2162
2163 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2164 {
2165         printk("Block Group:\n");
2166         printk("bg_signature:       %s\n", bg->bg_signature);
2167         printk("bg_size:            %u\n", bg->bg_size);
2168         printk("bg_bits:            %u\n", bg->bg_bits);
2169         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2170         printk("bg_chain:           %u\n", bg->bg_chain);
2171         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2172         printk("bg_next_group:      %llu\n",
2173                (unsigned long long)bg->bg_next_group);
2174         printk("bg_parent_dinode:   %llu\n",
2175                (unsigned long long)bg->bg_parent_dinode);
2176         printk("bg_blkno:           %llu\n",
2177                (unsigned long long)bg->bg_blkno);
2178 }
2179
2180 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2181 {
2182         int i;
2183
2184         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2185         printk("i_signature:                  %s\n", fe->i_signature);
2186         printk("i_size:                       %llu\n",
2187                (unsigned long long)fe->i_size);
2188         printk("i_clusters:                   %u\n", fe->i_clusters);
2189         printk("i_generation:                 %u\n",
2190                le32_to_cpu(fe->i_generation));
2191         printk("id1.bitmap1.i_used:           %u\n",
2192                le32_to_cpu(fe->id1.bitmap1.i_used));
2193         printk("id1.bitmap1.i_total:          %u\n",
2194                le32_to_cpu(fe->id1.bitmap1.i_total));
2195         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2196         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2197         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2198         printk("id2.i_chain.cl_next_free_rec: %u\n",
2199                fe->id2.i_chain.cl_next_free_rec);
2200         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2201                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2202                        fe->id2.i_chain.cl_recs[i].c_free);
2203                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2204                        fe->id2.i_chain.cl_recs[i].c_total);
2205                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2206                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2207         }
2208 }
2209
2210 /*
2211  * For a given allocation, determine which allocators will need to be
2212  * accessed, and lock them, reserving the appropriate number of bits.
2213  *
2214  * Sparse file systems call this from ocfs2_write_begin_nolock()
2215  * and ocfs2_allocate_unwritten_extents().
2216  *
2217  * File systems which don't support holes call this from
2218  * ocfs2_extend_allocation().
2219  */
2220 int ocfs2_lock_allocators(struct inode *inode,
2221                           struct ocfs2_extent_tree *et,
2222                           u32 clusters_to_add, u32 extents_to_split,
2223                           struct ocfs2_alloc_context **data_ac,
2224                           struct ocfs2_alloc_context **meta_ac)
2225 {
2226         int ret = 0, num_free_extents;
2227         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2228         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2229
2230         *meta_ac = NULL;
2231         if (data_ac)
2232                 *data_ac = NULL;
2233
2234         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2235
2236         num_free_extents = ocfs2_num_free_extents(osb, et);
2237         if (num_free_extents < 0) {
2238                 ret = num_free_extents;
2239                 mlog_errno(ret);
2240                 goto out;
2241         }
2242
2243         /*
2244          * Sparse allocation file systems need to be more conservative
2245          * with reserving room for expansion - the actual allocation
2246          * happens while we've got a journal handle open so re-taking
2247          * a cluster lock (because we ran out of room for another
2248          * extent) will violate ordering rules.
2249          *
2250          * Most of the time we'll only be seeing this 1 cluster at a time
2251          * anyway.
2252          *
2253          * Always lock for any unwritten extents - we might want to
2254          * add blocks during a split.
2255          */
2256         if (!num_free_extents ||
2257             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2258                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2259                 if (ret < 0) {
2260                         if (ret != -ENOSPC)
2261                                 mlog_errno(ret);
2262                         goto out;
2263                 }
2264         }
2265
2266         if (clusters_to_add == 0)
2267                 goto out;
2268
2269         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2270         if (ret < 0) {
2271                 if (ret != -ENOSPC)
2272                         mlog_errno(ret);
2273                 goto out;
2274         }
2275
2276 out:
2277         if (ret) {
2278                 if (*meta_ac) {
2279                         ocfs2_free_alloc_context(*meta_ac);
2280                         *meta_ac = NULL;
2281                 }
2282
2283                 /*
2284                  * We cannot have an error and a non null *data_ac.
2285                  */
2286         }
2287
2288         return ret;
2289 }
2290
2291 /*
2292  * Read the inode specified by blkno to get suballoc_slot and
2293  * suballoc_bit.
2294  */
2295 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2296                                        u16 *suballoc_slot, u16 *suballoc_bit)
2297 {
2298         int status;
2299         struct buffer_head *inode_bh = NULL;
2300         struct ocfs2_dinode *inode_fe;
2301
2302         mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2303
2304         /* dirty read disk */
2305         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2306         if (status < 0) {
2307                 mlog(ML_ERROR, "read block %llu failed %d\n",
2308                      (unsigned long long)blkno, status);
2309                 goto bail;
2310         }
2311
2312         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2313         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2314                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2315                      (unsigned long long)blkno);
2316                 status = -EINVAL;
2317                 goto bail;
2318         }
2319
2320         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2321             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2322                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2323                      (unsigned long long)blkno,
2324                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2325                 status = -EINVAL;
2326                 goto bail;
2327         }
2328
2329         if (suballoc_slot)
2330                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2331         if (suballoc_bit)
2332                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2333
2334 bail:
2335         brelse(inode_bh);
2336
2337         mlog_exit(status);
2338         return status;
2339 }
2340
2341 /*
2342  * test whether bit is SET in allocator bitmap or not.  on success, 0
2343  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2344  * is returned and *res is meaningless.  Call this after you have
2345  * cluster locked against suballoc, or you may get a result based on
2346  * non-up2date contents
2347  */
2348 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2349                                    struct inode *suballoc,
2350                                    struct buffer_head *alloc_bh, u64 blkno,
2351                                    u16 bit, int *res)
2352 {
2353         struct ocfs2_dinode *alloc_fe;
2354         struct ocfs2_group_desc *group;
2355         struct buffer_head *group_bh = NULL;
2356         u64 bg_blkno;
2357         int status;
2358
2359         mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2360                    (unsigned int)bit);
2361
2362         alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2363         if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2364                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2365                      (unsigned int)bit,
2366                      ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2367                 status = -EINVAL;
2368                 goto bail;
2369         }
2370
2371         bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2372         status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2373                                              &group_bh);
2374         if (status < 0) {
2375                 mlog(ML_ERROR, "read group %llu failed %d\n",
2376                      (unsigned long long)bg_blkno, status);
2377                 goto bail;
2378         }
2379
2380         group = (struct ocfs2_group_desc *) group_bh->b_data;
2381         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2382
2383 bail:
2384         brelse(group_bh);
2385
2386         mlog_exit(status);
2387         return status;
2388 }
2389
2390 /*
2391  * Test if the bit representing this inode (blkno) is set in the
2392  * suballocator.
2393  *
2394  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2395  *
2396  * In the event of failure, a negative value is returned and *res is
2397  * meaningless.
2398  *
2399  * Callers must make sure to hold nfs_sync_lock to prevent
2400  * ocfs2_delete_inode() on another node from accessing the same
2401  * suballocator concurrently.
2402  */
2403 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2404 {
2405         int status;
2406         u16 suballoc_bit = 0, suballoc_slot = 0;
2407         struct inode *inode_alloc_inode;
2408         struct buffer_head *alloc_bh = NULL;
2409
2410         mlog_entry("blkno: %llu", (unsigned long long)blkno);
2411
2412         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2413                                              &suballoc_bit);
2414         if (status < 0) {
2415                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2416                 goto bail;
2417         }
2418
2419         inode_alloc_inode =
2420                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2421                                             suballoc_slot);
2422         if (!inode_alloc_inode) {
2423                 /* the error code could be inaccurate, but we are not able to
2424                  * get the correct one. */
2425                 status = -EINVAL;
2426                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2427                      (u32)suballoc_slot);
2428                 goto bail;
2429         }
2430
2431         mutex_lock(&inode_alloc_inode->i_mutex);
2432         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2433         if (status < 0) {
2434                 mutex_unlock(&inode_alloc_inode->i_mutex);
2435                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2436                      (u32)suballoc_slot, status);
2437                 goto bail;
2438         }
2439
2440         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2441                                          blkno, suballoc_bit, res);
2442         if (status < 0)
2443                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2444
2445         ocfs2_inode_unlock(inode_alloc_inode, 0);
2446         mutex_unlock(&inode_alloc_inode->i_mutex);
2447
2448         iput(inode_alloc_inode);
2449         brelse(alloc_bh);
2450 bail:
2451         mlog_exit(status);
2452         return status;
2453 }