1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
6 * Copyright (C) 2009 Oracle. All rights reserved.
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
18 #define MLOG_MASK_PREFIX ML_REFCOUNT
19 #include <cluster/masklog.h>
27 #include "buffer_head_io.h"
28 #include "blockcheck.h"
29 #include "refcounttree.h"
// Recover the enclosing ocfs2_refcount_tree from a pointer to its embedded
// rf_ci member.
// @ci: caching info that lives inside a refcount tree as rf_ci.
// Returns the containing struct ocfs2_refcount_tree (via container_of).
33 static inline struct ocfs2_refcount_tree *
34 cache_info_to_refcount(struct ocfs2_caching_info *ci)
36 return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
// Validation callback for a refcount block that has just been read; it is
// the validator handed to ocfs2_read_block() by ocfs2_read_refcount_block().
// @sb: superblock the block belongs to.
// @bh: buffer holding the block; it must already be up to date.
// Checks, in order: metadata ECC (rf_check), block signature, that the
// block number recorded in the block (rf_blkno) matches where it was read
// from, and that rf_fs_generation matches the mounted filesystem.
// NOTE(review): the lines that declare rc, report the non-ECC failures and
// return are missing from this listing, so the exact error codes cannot be
// stated from here.
39 static int ocfs2_validate_refcount_block(struct super_block *sb,
40 struct buffer_head *bh)
43 struct ocfs2_refcount_block *rb =
44 (struct ocfs2_refcount_block *)bh->b_data;
46 mlog(0, "Validating refcount block %llu\n",
47 (unsigned long long)bh->b_blocknr);
// A buffer that is not up to date at this point is treated as a fatal
// programming error rather than a runtime failure.
49 BUG_ON(!buffer_uptodate(bh));
52 * If the ecc fails, we return the error but otherwise
53 * leave the filesystem running. We know any error is
54 * local to this block.
56 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
58 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
59 (unsigned long long)bh->b_blocknr);
// Signature check.
64 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
66 "Refcount block #%llu has bad signature %.*s",
67 (unsigned long long)bh->b_blocknr, 7,
// The on-disk block number is little-endian; compare it in CPU order.
72 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
74 "Refcount block #%llu has an invalid rf_blkno "
76 (unsigned long long)bh->b_blocknr,
77 (unsigned long long)le64_to_cpu(rb->rf_blkno));
// The block must have been written by this filesystem generation.
81 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
83 "Refcount block #%llu has an invalid "
84 "rf_fs_generation of #%u",
85 (unsigned long long)bh->b_blocknr,
86 le32_to_cpu(rb->rf_fs_generation));
// Read one refcount block through the given metadata cache, validating it
// with ocfs2_validate_refcount_block().
// @ci: metadata cache to read through.
// @bh: in/out buffer pointer; its current value seeds the local tmp that is
//      passed to ocfs2_read_block().
// NOTE(review): the block-number parameter (rb_blkno) is declared on a line
// missing from this listing, as are the declaration of rc, the statement
// that stores tmp back into *bh, and the return.
93 static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
95 struct buffer_head **bh)
98 struct buffer_head *tmp = *bh;
100 rc = ocfs2_read_block(ci, rb_blkno, &tmp,
101 ocfs2_validate_refcount_block);
103 /* If ocfs2_read_block() got us a new bh, pass it up. */
// co_owner callback of ocfs2_refcount_caching_ops.
// Resolves the refcount tree that embeds ci; the u64 actually returned is on
// a line missing from this listing (presumably the block number of the
// tree - confirm against the full source).
110 static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
112 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
// co_get_super callback of ocfs2_refcount_caching_ops.
// Resolves the refcount tree that embeds ci; the super_block actually
// returned is on a line missing from this listing (presumably the rf_sb
// member of the tree - confirm against the full source).
117 static struct super_block *
118 ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
120 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
// co_cache_lock callback: take the spinlock (rf_lock) of the refcount tree
// that embeds ci. Paired with ocfs2_refcount_cache_unlock().
125 static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
127 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
129 spin_lock(&rf->rf_lock);
// co_cache_unlock callback: release the spinlock (rf_lock) of the refcount
// tree that embeds ci. Paired with ocfs2_refcount_cache_lock().
132 static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
134 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
136 spin_unlock(&rf->rf_lock);
// co_io_lock callback: take the I/O mutex (rf_io_mutex) of the refcount tree
// that embeds ci. Paired with ocfs2_refcount_cache_io_unlock().
139 static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
141 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
143 mutex_lock(&rf->rf_io_mutex);
// co_io_unlock callback: release the I/O mutex (rf_io_mutex) of the refcount
// tree that embeds ci. Paired with ocfs2_refcount_cache_io_lock().
146 static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
148 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
150 mutex_unlock(&rf->rf_io_mutex);
// Callback table that binds the generic ocfs2 metadata cache to a refcount
// tree. It is installed on each tree by ocfs2_init_refcount_tree_ci(), and
// every callback maps its caching-info argument back to the owning tree
// with cache_info_to_refcount().
153 static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
154 .co_owner = ocfs2_refcount_cache_owner,
155 .co_get_super = ocfs2_refcount_cache_get_super,
156 .co_cache_lock = ocfs2_refcount_cache_lock,
157 .co_cache_unlock = ocfs2_refcount_cache_unlock,
158 .co_io_lock = ocfs2_refcount_cache_io_lock,
159 .co_io_unlock = ocfs2_refcount_cache_io_unlock,
// Search the rb-tree index osb->osb_rf_lock_tree, which is keyed by
// rf_blkno, for the refcount tree rooted at block blkno.
// @osb: ocfs2 super that owns the index.
// @blkno: block number to look for.
// Every caller visible in this file holds osb->osb_lock around the call.
// NOTE(review): the loop header, the child-descent statements and the
// return statements are missing from this listing; presumably the match is
// returned, or NULL when there is none - confirm against the full source.
162 static struct ocfs2_refcount_tree *
163 ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
165 struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
166 struct ocfs2_refcount_tree *tree = NULL;
169 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
171 if (blkno < tree->rf_blkno)
173 else if (blkno > tree->rf_blkno)
// Insert a refcount tree into the rb-tree index osb->osb_rf_lock_tree,
// ordered by rf_blkno.
// @osb: ocfs2 super that owns the index.
// @new: tree to insert; its rf_blkno is the sort key.
// A node with the same block number is reported through mlog(ML_ERROR).
// NOTE(review): the loop header, the descent statements and whatever follows
// the duplicate report are missing from this listing.
182 /* osb_lock is already locked. */
183 static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
184 struct ocfs2_refcount_tree *new)
186 u64 rf_blkno = new->rf_blkno;
187 struct rb_node *parent = NULL;
188 struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
189 struct ocfs2_refcount_tree *tmp;
194 tmp = rb_entry(parent, struct ocfs2_refcount_tree,
197 if (rf_blkno < tmp->rf_blkno)
199 else if (rf_blkno > tmp->rf_blkno)
202 /* This should never happen! */
203 mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
204 (unsigned long long)rf_blkno);
// Link the node at the slot found by the walk, then rebalance.
209 rb_link_node(&new->rf_node, parent, p);
210 rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
// Tear down a refcount tree: shut down its metadata cache, then drop and
// free its cluster lock resource.
// @tree: tree to destroy; its rf_sb is used to reach the ocfs2 super.
// NOTE(review): the statement that releases the memory of the structure
// itself is not visible in this listing.
213 static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
215 ocfs2_metadata_cache_exit(&tree->rf_ci);
216 ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
217 ocfs2_lock_res_free(&tree->rf_lockres);
// Unlink a refcount tree from the rb-tree index of osb and, if it is the
// entry held in the one-entry lookup cache osb_ref_tree_lru, clear that
// cache as well.
// Takes no lock itself; ocfs2_erase_refcount_tree_from_list() is the variant
// that wraps this call in osb->osb_lock.
// NOTE(review): the line carrying the storage class and return type of this
// function is missing from the listing.
222 ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
223 struct ocfs2_refcount_tree *tree)
225 rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
226 if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
227 osb->osb_ref_tree_lru = NULL;
// Locked variant of ocfs2_erase_refcount_tree_from_list_no_lock(): removes
// the tree from the index of osb while holding osb->osb_lock.
// @osb: ocfs2 super that owns the index.
// @tree: tree to unlink.
230 static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
231 struct ocfs2_refcount_tree *tree)
233 spin_lock(&osb->osb_lock);
234 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
235 spin_unlock(&osb->osb_lock);
// kref release callback for a refcount tree; passed to kref_put() by
// ocfs2_refcount_tree_put().
// @kref: the rf_getcnt member of the tree whose last reference was dropped.
// Maps the kref back to its tree and destroys the tree.
238 void ocfs2_kref_remove_refcount_tree(struct kref *kref)
240 struct ocfs2_refcount_tree *tree =
241 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
243 ocfs2_free_refcount_tree(tree);
// Take one additional reference on a refcount tree (rf_getcnt).
// NOTE(review): the line carrying the storage class and return type of this
// function is missing from the listing.
247 ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
249 kref_get(&tree->rf_getcnt);
// Drop one reference on a refcount tree (rf_getcnt); when it was the last
// one, ocfs2_kref_remove_refcount_tree() runs and destroys the tree.
// NOTE(review): the line carrying the storage class and return type of this
// function is missing from the listing.
253 ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
255 kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
// Initialise the metadata-cache side of a new refcount tree: hook rf_ci up
// to ocfs2_refcount_caching_ops and initialise the two locks those
// callbacks use (rf_io_mutex and rf_lock).
// @new: tree being set up.
// @sb: super block of the filesystem; its use is on a line missing from
//      this listing (presumably stored in the tree - confirm).
258 static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
259 struct super_block *sb)
261 ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
262 mutex_init(&new->rf_io_mutex);
264 spin_lock_init(&new->rf_lock);
// Initialise the locking side of a new refcount tree: the local rw-semaphore
// rf_sem and the lock resource rf_lockres.
// @osb: ocfs2 super the lock resource is created for.
// @new: tree being set up.
// @rf_blkno: block number of the refcount tree root, passed to the lock
//            resource initialiser.
// @generation: generation of the refcount block, passed to the lock
//              resource initialiser.
267 static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
268 struct ocfs2_refcount_tree *new,
269 u64 rf_blkno, u32 generation)
271 init_rwsem(&new->rf_sem);
272 ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
273 rf_blkno, generation);
// Allocate a zeroed in-memory refcount tree for block rf_blkno and set up
// its reference count and metadata cache. The lock side is NOT initialised
// here; the callers visible in this file call
// ocfs2_init_refcount_tree_lock() afterwards, once the generation is known.
// @osb: ocfs2 super; osb->sb is handed to the cache initialiser.
// @rf_blkno: block number of the refcount tree root.
// NOTE(review): the allocation-failure check and the return statement are
// missing from this listing.
276 static struct ocfs2_refcount_tree*
277 ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
279 struct ocfs2_refcount_tree *new;
281 new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
285 new->rf_blkno = rf_blkno;
// kref_init() leaves the tree holding one initial reference.
286 kref_init(&new->rf_getcnt);
287 ocfs2_init_refcount_tree_ci(new, osb->sb);
// Find, or build and index, the in-memory refcount tree for block rf_blkno.
// @osb: ocfs2 super whose index (osb_rf_lock_tree) and one-entry cache
//       (osb_ref_tree_lru) are consulted under osb_lock.
// @rf_blkno: block number of the refcount tree root.
// @ret_tree: output parameter; the statement that stores into it is missing
//            from this listing.
// Visible flow: look in the cache and then the rb-tree under osb_lock; drop
// the lock, allocate a new tree and read the root block to learn its
// generation; retake the lock, search again and insert the new tree; record
// the result in osb_ref_tree_lru. The trailing ocfs2_free_refcount_tree(new)
// is presumably for the case where the second search finds a tree that was
// inserted meanwhile - confirm against the full source.
// NOTE(review): the declaration of ret, the branch conditions, gotos, labels
// and the return are on lines missing from this listing.
292 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
293 struct ocfs2_refcount_tree **ret_tree)
296 struct ocfs2_refcount_tree *tree, *new = NULL;
297 struct buffer_head *ref_root_bh = NULL;
298 struct ocfs2_refcount_block *ref_rb;
// First lookup: one-entry cache, then the rb-tree, both under osb_lock.
300 spin_lock(&osb->osb_lock);
301 if (osb->osb_ref_tree_lru &&
302 osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
303 tree = osb->osb_ref_tree_lru;
305 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
309 spin_unlock(&osb->osb_lock);
// Allocation and the block read below happen with osb_lock released.
311 new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
318 * We need the generation to create the refcount tree lock and since
319 * it isn't changed during the tree modification, we are safe here to
320 * read without protection.
321 * We also have to purge the cache after we create the lock since the
322 * refcount block may have the stale data. It can only be trusted when
323 * we hold the refcount lock.
325 ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
328 ocfs2_metadata_cache_exit(&new->rf_ci);
333 ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
334 new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
335 ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
337 ocfs2_metadata_cache_purge(&new->rf_ci);
// Second lookup under the lock before the new tree is inserted.
339 spin_lock(&osb->osb_lock);
340 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
344 ocfs2_insert_refcount_tree(osb, new);
352 osb->osb_ref_tree_lru = tree;
354 spin_unlock(&osb->osb_lock);
357 ocfs2_free_refcount_tree(new);
// Fetch the block number of the refcount tree root recorded in an inode.
// @inode: inode to query; it must already carry OCFS2_HAS_REFCOUNT_FL,
//         otherwise BUG_ON fires.
// @ref_blkno: output; receives i_refcount_loc converted to CPU byte order.
// NOTE(review): the declaration of ret, the error check after reading the
// inode block, the release of di_bh and the return are missing from this
// listing.
363 static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
366 struct buffer_head *di_bh = NULL;
367 struct ocfs2_dinode *di;
369 ret = ocfs2_read_inode_block(inode, &di_bh);
375 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
377 di = (struct ocfs2_dinode *)di_bh->b_data;
378 *ref_blkno = le64_to_cpu(di->i_refcount_loc);
// Take both locks of a refcount tree: first the lock obtained through
// ocfs2_refcount_lock(), then the local rw-semaphore rf_sem.
// @osb: ocfs2 super (not referenced on the lines visible here).
// @tree: tree to lock.
// @rw: lock mode, forwarded to ocfs2_refcount_lock(); it presumably also
//      selects down_write versus down_read, but that condition is on a line
//      missing from this listing - confirm.
// NOTE(review): the declaration of ret, the error handling and the return
// are missing from this listing.
384 static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
385 struct ocfs2_refcount_tree *tree, int rw)
389 ret = ocfs2_refcount_lock(tree, rw);
396 down_write(&tree->rf_sem);
398 down_read(&tree->rf_sem);
405 * Lock the refcount tree pointed to by ref_blkno and return the tree.
406 * In most cases, we lock the tree and then read the refcount block,
407 * so read it here if the caller really needs it.
409 * If the tree has been re-created by another node, this function frees
410 * the old one and re-creates it.
// Look up (or create) the refcount tree for ref_blkno, lock it, and read
// its root block.
// @osb: ocfs2 super.
// @ref_blkno: block number of the refcount tree root.
// @rw: lock mode forwarded to __ocfs2_lock_refcount_tree().
// @ret_tree: output for the locked tree; the store is on a line missing from
//            this listing.
// @ref_bh: output for the buffer of the root block (see the final visible
//          statement).
// Visible flow: get the tree, take a reference, lock it, read the root
// block, and compare the generation cached in the tree with the one on disk.
// On a mismatch the tree is unlinked from the index (once, guarded by
// rf_removed), unlocked and put; per the original comment the code then
// retries from the top.
// NOTE(review): the retry label, the gotos, the error checks, the use of
// delete_tree and the return are on lines missing from this listing.
412 int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
413 u64 ref_blkno, int rw,
414 struct ocfs2_refcount_tree **ret_tree,
415 struct buffer_head **ref_bh)
417 int ret, delete_tree = 0;
418 struct ocfs2_refcount_tree *tree = NULL;
419 struct buffer_head *ref_root_bh = NULL;
420 struct ocfs2_refcount_block *rb;
423 ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
// Reference held for the duration of the lock.
429 ocfs2_refcount_tree_get(tree);
431 ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
434 ocfs2_refcount_tree_put(tree);
438 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
442 ocfs2_unlock_refcount_tree(osb, tree, rw);
443 ocfs2_refcount_tree_put(tree);
447 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
449 * If the refcount block has been freed and re-created, we may need
450 * to recreate the refcount tree also.
452 * Here we just remove the tree from the rb-tree, and the last
453 * kref holder will unlock and delete this refcount_tree.
454 * Then we goto "again" and ocfs2_get_refcount_tree will create
455 * the new refcount tree for us.
457 if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
458 if (!tree->rf_removed) {
459 ocfs2_erase_refcount_tree_from_list(osb, tree);
460 tree->rf_removed = 1;
464 ocfs2_unlock_refcount_tree(osb, tree, rw);
466 * We get an extra reference when we create the refcount
467 * tree, so another put will destroy it.
470 ocfs2_refcount_tree_put(tree);
478 *ref_bh = ref_root_bh;
// Convenience wrapper: find the refcount root block recorded in the inode
// and lock that tree with ocfs2_lock_refcount_tree().
// @inode: inode whose refcount tree is wanted.
// @rw, @ret_tree, @ref_bh: forwarded unchanged to ocfs2_lock_refcount_tree().
// Returns whatever ocfs2_lock_refcount_tree() returns on the visible path.
// NOTE(review): the declarations of ret and ref_blkno and the error check
// after ocfs2_get_refcount_block() are missing from this listing.
486 int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
487 struct ocfs2_refcount_tree **ret_tree,
488 struct buffer_head **ref_bh)
493 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
499 return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
500 rw, ret_tree, ref_bh);
// Release the locks taken by __ocfs2_lock_refcount_tree() in reverse order:
// first the local rw-semaphore rf_sem, then the lock released through
// ocfs2_refcount_unlock(). Finally drops one reference on the tree.
// @osb: ocfs2 super (not referenced on the lines visible here).
// @tree: tree to unlock.
// @rw: lock mode that was used to lock; forwarded to ocfs2_refcount_unlock().
//      The condition choosing up_write versus up_read is on a line missing
//      from this listing.
503 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
504 struct ocfs2_refcount_tree *tree, int rw)
507 up_write(&tree->rf_sem);
509 up_read(&tree->rf_sem);
511 ocfs2_refcount_unlock(tree, rw);
512 ocfs2_refcount_tree_put(tree);
// Destroy every refcount tree still indexed in osb->osb_rf_lock_tree.
// @osb: ocfs2 super whose index is emptied.
// Repeatedly takes the last node of the rb-tree, unlinks it and frees the
// tree directly with ocfs2_free_refcount_tree() (the kref is not consulted),
// until the index is empty.
// No locking is visible on these lines - presumably the caller guarantees
// exclusive access; confirm against the call site.
515 void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
517 struct rb_node *node;
518 struct ocfs2_refcount_tree *tree;
519 struct rb_root *root = &osb->osb_rf_lock_tree;
521 while ((node = rb_last(root)) != NULL) {
522 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
524 mlog(0, "Purge tree %llu\n",
525 (unsigned long long) tree->rf_blkno);
527 rb_erase(&tree->rf_node, root);
528 ocfs2_free_refcount_tree(tree);
// @inode: inode that is to receive a refcount tree; it must not already have
//         OCFS2_HAS_REFCOUNT_FL set, otherwise BUG_ON fires.
// @di_bh: buffer holding the on-disk inode, which is updated here.
// Visible flow: reserve and claim one metadata block inside a transaction,
// allocate the in-memory tree, initialise the new block as a refcount block
// with rf_count 1 and a fresh generation, flag the inode as refcounted and
// point it at the new block, initialise the tree lock, then replace any
// stale tree indexed under the same block number with the new one.
// NOTE(review): the declarations of ret, num_got and first_blkno, all error
// checks, gotos, cleanup labels and the return are on lines missing from
// this listing.
533 * Create a refcount tree for an inode.
534 * We take for granted that the inode is already locked.
536 static int ocfs2_create_refcount_tree(struct inode *inode,
537 struct buffer_head *di_bh)
540 handle_t *handle = NULL;
541 struct ocfs2_alloc_context *meta_ac = NULL;
542 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
543 struct ocfs2_inode_info *oi = OCFS2_I(inode);
544 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
545 struct buffer_head *new_bh = NULL;
546 struct ocfs2_refcount_block *rb;
547 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
548 u16 suballoc_bit_start;
552 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
554 mlog(0, "create tree for inode %lu\n", inode->i_ino);
556 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
562 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
563 if (IS_ERR(handle)) {
564 ret = PTR_ERR(handle);
569 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
570 OCFS2_JOURNAL_ACCESS_WRITE);
576 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
577 &suballoc_bit_start, &num_got,
584 new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
// NOTE(review): the result of sb_getblk() is used on the very next source
// line (591 then 592) with no NULL check in between - confirm that
// sb_getblk() cannot fail here, or add a check.
591 new_bh = sb_getblk(inode->i_sb, first_blkno);
592 ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
594 ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
595 OCFS2_JOURNAL_ACCESS_CREATE);
601 /* Initialize ocfs2_refcount_block. */
602 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
603 memset(rb, 0, inode->i_sb->s_blocksize);
// The signature string is written at the very start of the block.
604 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
605 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
606 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
607 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
608 rb->rf_blkno = cpu_to_le64(first_blkno);
609 rb->rf_count = cpu_to_le32(1);
610 rb->rf_records.rl_count =
611 cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
// s_next_generation is advanced under osb_lock.
// NOTE(review): the value is stored without cpu_to_le32() although the same
// field is read back with le32_to_cpu() further down - confirm the intended
// byte order of rf_generation.
612 spin_lock(&osb->osb_lock);
613 rb->rf_generation = osb->s_next_generation++;
614 spin_unlock(&osb->osb_lock);
616 ocfs2_journal_dirty(handle, new_bh);
// Flag the inode as refcounted, in memory and on disk, under ip_lock.
618 spin_lock(&oi->ip_lock);
619 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
620 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
621 di->i_refcount_loc = cpu_to_le64(first_blkno);
622 spin_unlock(&oi->ip_lock);
624 mlog(0, "created tree for inode %lu, refblock %llu\n",
625 inode->i_ino, (unsigned long long)first_blkno);
627 ocfs2_journal_dirty(handle, di_bh);
630 * We have to init the tree lock here since it will use
631 * the generation number to create it.
633 new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
634 ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
635 new_tree->rf_generation);
637 spin_lock(&osb->osb_lock);
638 tree = ocfs2_find_refcount_tree(osb, first_blkno);
641 * We've just created a new refcount tree in this block. If
642 * we found a refcount tree on the ocfs2_super, it must be
643 * one we just deleted. We free the old tree before
644 * inserting the new tree.
646 BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
648 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
649 ocfs2_insert_refcount_tree(osb, new_tree);
650 spin_unlock(&osb->osb_lock);
653 ocfs2_refcount_tree_put(tree);
656 ocfs2_commit_trans(osb, handle);
660 ocfs2_metadata_cache_exit(&new_tree->rf_ci);
666 ocfs2_free_alloc_context(meta_ac);
// Attach an inode to an already existing refcount tree.
// @inode: inode to attach; it must not already have OCFS2_HAS_REFCOUNT_FL
//         set, otherwise BUG_ON fires.
// @di_bh: buffer holding the on-disk inode, which is updated here.
// Visible flow: lock the tree at refcount_loc with rw = 1, start a
// transaction, get journal access to both the inode and the root block,
// increment rf_count of the root, flag the inode as refcounted and point
// it at refcount_loc, commit, and unlock the tree.
// NOTE(review): the third parameter (refcount_loc) is declared on a line
// missing from this listing, as are the declaration of ret, the error
// checks, the cleanup labels, the release of ref_root_bh and the return.
671 static int ocfs2_set_refcount_tree(struct inode *inode,
672 struct buffer_head *di_bh,
676 handle_t *handle = NULL;
677 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
678 struct ocfs2_inode_info *oi = OCFS2_I(inode);
679 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
680 struct buffer_head *ref_root_bh = NULL;
681 struct ocfs2_refcount_block *rb;
682 struct ocfs2_refcount_tree *ref_tree;
684 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
686 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
687 &ref_tree, &ref_root_bh);
693 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
694 if (IS_ERR(handle)) {
695 ret = PTR_ERR(handle);
700 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
701 OCFS2_JOURNAL_ACCESS_WRITE);
707 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
708 OCFS2_JOURNAL_ACCESS_WRITE);
// One more inode now shares this tree.
714 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
715 le32_add_cpu(&rb->rf_count, 1);
717 ocfs2_journal_dirty(handle, ref_root_bh);
// Flag the inode as refcounted, in memory and on disk, under ip_lock.
719 spin_lock(&oi->ip_lock);
720 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
721 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
722 di->i_refcount_loc = cpu_to_le64(refcount_loc);
723 spin_unlock(&oi->ip_lock);
724 ocfs2_journal_dirty(handle, di_bh);
727 ocfs2_commit_trans(osb, handle);
729 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
// Detach an inode from its refcount tree.
// @inode: inode to detach; OCFS2_HAS_REFCOUNT_FL is tested first (the action
//         taken when it is clear is on a line missing from this listing).
// @di_bh: buffer holding the on-disk inode, which is updated here.
// Visible flow: lock the tree with rw = 1; when rf_count is 1, additionally
// locate and lock the extent allocator that owns the root block and enlarge
// the transaction credits; start the transaction; clear the refcount flag
// and location in the inode; decrement rf_count; unlink the tree from the
// index and give the block back to the allocator; commit; release the
// allocator and the tree.
// NOTE(review): the declaration of bit, the conditions guarding the
// last-user steps, all error checks, cleanup labels, the use of delete_tree
// and the return are on lines missing from this listing.
735 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
737 int ret, delete_tree = 0;
738 handle_t *handle = NULL;
739 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
740 struct ocfs2_inode_info *oi = OCFS2_I(inode);
741 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
742 struct ocfs2_refcount_block *rb;
743 struct inode *alloc_inode = NULL;
744 struct buffer_head *alloc_bh = NULL;
745 struct buffer_head *blk_bh = NULL;
746 struct ocfs2_refcount_tree *ref_tree;
747 int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
748 u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
751 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
755 ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
761 rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
764 * If we are the last user, we need to free the block.
765 * So lock the allocator ahead.
767 if (le32_to_cpu(rb->rf_count) == 1) {
768 blk = le64_to_cpu(rb->rf_blkno);
769 bit = le16_to_cpu(rb->rf_suballoc_bit);
770 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
// The allocator is the extent-alloc system file of the slot recorded in the
// block itself.
772 alloc_inode = ocfs2_get_system_file_inode(osb,
773 EXTENT_ALLOC_SYSTEM_INODE,
774 le16_to_cpu(rb->rf_suballoc_slot));
780 mutex_lock(&alloc_inode->i_mutex);
782 ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
// Freeing the block needs extra journal credits.
788 credits += OCFS2_SUBALLOC_FREE;
791 handle = ocfs2_start_trans(osb, credits);
792 if (IS_ERR(handle)) {
793 ret = PTR_ERR(handle);
798 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
799 OCFS2_JOURNAL_ACCESS_WRITE);
805 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
806 OCFS2_JOURNAL_ACCESS_WRITE);
// Clear the refcount flag and location, in memory and on disk, under ip_lock.
812 spin_lock(&oi->ip_lock);
813 oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
814 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
815 di->i_refcount_loc = 0;
816 spin_unlock(&oi->ip_lock);
817 ocfs2_journal_dirty(handle, di_bh);
// One fewer inode shares this tree.
819 le32_add_cpu(&rb->rf_count , -1);
820 ocfs2_journal_dirty(handle, blk_bh);
824 ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
825 ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
826 alloc_bh, bit, bg_blkno, 1);
832 ocfs2_commit_trans(osb, handle);
835 ocfs2_inode_unlock(alloc_inode, 1);
840 mutex_unlock(&alloc_inode->i_mutex);
844 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
846 ocfs2_refcount_tree_put(ref_tree);