ocfs2: Add refcount tree lock mechanism.
[safe/jmp/linux-2.6] / fs / ocfs2 / refcounttree.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * refcounttree.c
5  *
6  * Copyright (C) 2009 Oracle.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public
10  * License version 2 as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  */
17
18 #define MLOG_MASK_PREFIX ML_REFCOUNT
19 #include <cluster/masklog.h>
20 #include "ocfs2.h"
21 #include "inode.h"
22 #include "alloc.h"
23 #include "suballoc.h"
24 #include "journal.h"
25 #include "uptodate.h"
26 #include "super.h"
27 #include "buffer_head_io.h"
28 #include "blockcheck.h"
29 #include "refcounttree.h"
30 #include "dlmglue.h"
31
32 static inline struct ocfs2_refcount_tree *
33 cache_info_to_refcount(struct ocfs2_caching_info *ci)
34 {
35         return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
36 }
37
38 static int ocfs2_validate_refcount_block(struct super_block *sb,
39                                          struct buffer_head *bh)
40 {
41         int rc;
42         struct ocfs2_refcount_block *rb =
43                 (struct ocfs2_refcount_block *)bh->b_data;
44
45         mlog(0, "Validating refcount block %llu\n",
46              (unsigned long long)bh->b_blocknr);
47
48         BUG_ON(!buffer_uptodate(bh));
49
50         /*
51          * If the ecc fails, we return the error but otherwise
52          * leave the filesystem running.  We know any error is
53          * local to this block.
54          */
55         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
56         if (rc) {
57                 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
58                      (unsigned long long)bh->b_blocknr);
59                 return rc;
60         }
61
62
63         if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
64                 ocfs2_error(sb,
65                             "Refcount block #%llu has bad signature %.*s",
66                             (unsigned long long)bh->b_blocknr, 7,
67                             rb->rf_signature);
68                 return -EINVAL;
69         }
70
71         if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
72                 ocfs2_error(sb,
73                             "Refcount block #%llu has an invalid rf_blkno "
74                             "of %llu",
75                             (unsigned long long)bh->b_blocknr,
76                             (unsigned long long)le64_to_cpu(rb->rf_blkno));
77                 return -EINVAL;
78         }
79
80         if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
81                 ocfs2_error(sb,
82                             "Refcount block #%llu has an invalid "
83                             "rf_fs_generation of #%u",
84                             (unsigned long long)bh->b_blocknr,
85                             le32_to_cpu(rb->rf_fs_generation));
86                 return -EINVAL;
87         }
88
89         return 0;
90 }
91
92 static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
93                                      u64 rb_blkno,
94                                      struct buffer_head **bh)
95 {
96         int rc;
97         struct buffer_head *tmp = *bh;
98
99         rc = ocfs2_read_block(ci, rb_blkno, &tmp,
100                               ocfs2_validate_refcount_block);
101
102         /* If ocfs2_read_block() got us a new bh, pass it up. */
103         if (!rc && !*bh)
104                 *bh = tmp;
105
106         return rc;
107 }
108
109 static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
110 {
111         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
112
113         return rf->rf_blkno;
114 }
115
116 static struct super_block *
117 ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
118 {
119         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
120
121         return rf->rf_sb;
122 }
123
124 static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
125 {
126         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
127
128         spin_lock(&rf->rf_lock);
129 }
130
131 static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
132 {
133         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
134
135         spin_unlock(&rf->rf_lock);
136 }
137
138 static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
139 {
140         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
141
142         mutex_lock(&rf->rf_io_mutex);
143 }
144
145 static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
146 {
147         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
148
149         mutex_unlock(&rf->rf_io_mutex);
150 }
151
152 static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
153         .co_owner               = ocfs2_refcount_cache_owner,
154         .co_get_super           = ocfs2_refcount_cache_get_super,
155         .co_cache_lock          = ocfs2_refcount_cache_lock,
156         .co_cache_unlock        = ocfs2_refcount_cache_unlock,
157         .co_io_lock             = ocfs2_refcount_cache_io_lock,
158         .co_io_unlock           = ocfs2_refcount_cache_io_unlock,
159 };
160
161 static struct ocfs2_refcount_tree *
162 ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
163 {
164         struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
165         struct ocfs2_refcount_tree *tree = NULL;
166
167         while (n) {
168                 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
169
170                 if (blkno < tree->rf_blkno)
171                         n = n->rb_left;
172                 else if (blkno > tree->rf_blkno)
173                         n = n->rb_right;
174                 else
175                         return tree;
176         }
177
178         return NULL;
179 }
180
181 /* osb_lock is already locked. */
182 static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
183                                        struct ocfs2_refcount_tree *new)
184 {
185         u64 rf_blkno = new->rf_blkno;
186         struct rb_node *parent = NULL;
187         struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
188         struct ocfs2_refcount_tree *tmp;
189
190         while (*p) {
191                 parent = *p;
192
193                 tmp = rb_entry(parent, struct ocfs2_refcount_tree,
194                                rf_node);
195
196                 if (rf_blkno < tmp->rf_blkno)
197                         p = &(*p)->rb_left;
198                 else if (rf_blkno > tmp->rf_blkno)
199                         p = &(*p)->rb_right;
200                 else {
201                         /* This should never happen! */
202                         mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
203                              (unsigned long long)rf_blkno);
204                         BUG();
205                 }
206         }
207
208         rb_link_node(&new->rf_node, parent, p);
209         rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
210 }
211
212 static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
213 {
214         ocfs2_metadata_cache_exit(&tree->rf_ci);
215         ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
216         ocfs2_lock_res_free(&tree->rf_lockres);
217         kfree(tree);
218 }
219
220 static inline void
221 ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
222                                         struct ocfs2_refcount_tree *tree)
223 {
224         rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
225         if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
226                 osb->osb_ref_tree_lru = NULL;
227 }
228
229 static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
230                                         struct ocfs2_refcount_tree *tree)
231 {
232         spin_lock(&osb->osb_lock);
233         ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
234         spin_unlock(&osb->osb_lock);
235 }
236
237 void ocfs2_kref_remove_refcount_tree(struct kref *kref)
238 {
239         struct ocfs2_refcount_tree *tree =
240                 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
241
242         ocfs2_free_refcount_tree(tree);
243 }
244
245 static inline void
246 ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
247 {
248         kref_get(&tree->rf_getcnt);
249 }
250
251 static inline void
252 ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
253 {
254         kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
255 }
256
257 static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
258                                                struct super_block *sb)
259 {
260         ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
261         mutex_init(&new->rf_io_mutex);
262         new->rf_sb = sb;
263         spin_lock_init(&new->rf_lock);
264 }
265
266 static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
267                                         struct ocfs2_refcount_tree *new,
268                                         u64 rf_blkno, u32 generation)
269 {
270         init_rwsem(&new->rf_sem);
271         ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
272                                      rf_blkno, generation);
273 }
274
275 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
276                                    struct ocfs2_refcount_tree **ret_tree)
277 {
278         int ret = 0;
279         struct ocfs2_refcount_tree *tree, *new = NULL;
280         struct buffer_head *ref_root_bh = NULL;
281         struct ocfs2_refcount_block *ref_rb;
282
283         spin_lock(&osb->osb_lock);
284         if (osb->osb_ref_tree_lru &&
285             osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
286                 tree = osb->osb_ref_tree_lru;
287         else
288                 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
289         if (tree)
290                 goto out;
291
292         spin_unlock(&osb->osb_lock);
293
294         new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
295         if (!new) {
296                 ret = -ENOMEM;
297                 return ret;
298         }
299
300         new->rf_blkno = rf_blkno;
301         kref_init(&new->rf_getcnt);
302         ocfs2_init_refcount_tree_ci(new, osb->sb);
303
304         /*
305          * We need the generation to create the refcount tree lock and since
306          * it isn't changed during the tree modification, we are safe here to
307          * read without protection.
308          * We also have to purge the cache after we create the lock since the
309          * refcount block may have the stale data. It can only be trusted when
310          * we hold the refcount lock.
311          */
312         ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
313         if (ret) {
314                 mlog_errno(ret);
315                 ocfs2_metadata_cache_exit(&new->rf_ci);
316                 kfree(new);
317                 return ret;
318         }
319
320         ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
321         new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
322         ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
323                                       new->rf_generation);
324         ocfs2_metadata_cache_purge(&new->rf_ci);
325
326         spin_lock(&osb->osb_lock);
327         tree = ocfs2_find_refcount_tree(osb, rf_blkno);
328         if (tree)
329                 goto out;
330
331         ocfs2_insert_refcount_tree(osb, new);
332
333         tree = new;
334         new = NULL;
335
336 out:
337         *ret_tree = tree;
338
339         osb->osb_ref_tree_lru = tree;
340
341         spin_unlock(&osb->osb_lock);
342
343         if (new)
344                 ocfs2_free_refcount_tree(new);
345
346         brelse(ref_root_bh);
347         return ret;
348 }
349
350 static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
351 {
352         int ret;
353         struct buffer_head *di_bh = NULL;
354         struct ocfs2_dinode *di;
355
356         ret = ocfs2_read_inode_block(inode, &di_bh);
357         if (ret) {
358                 mlog_errno(ret);
359                 goto out;
360         }
361
362         BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
363
364         di = (struct ocfs2_dinode *)di_bh->b_data;
365         *ref_blkno = le64_to_cpu(di->i_refcount_loc);
366         brelse(di_bh);
367 out:
368         return ret;
369 }
370
371 static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
372                                       struct ocfs2_refcount_tree *tree, int rw)
373 {
374         int ret;
375
376         ret = ocfs2_refcount_lock(tree, rw);
377         if (ret) {
378                 mlog_errno(ret);
379                 goto out;
380         }
381
382         if (rw)
383                 down_write(&tree->rf_sem);
384         else
385                 down_read(&tree->rf_sem);
386
387 out:
388         return ret;
389 }
390
391 /*
392  * Lock the refcount tree pointed by ref_blkno and return the tree.
393  * In most case, we lock the tree and read the refcount block.
394  * So read it here if the caller really needs it.
395  *
396  * If the tree has been re-created by other node, it will free the
397  * old one and re-create it.
398  */
399 int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
400                              u64 ref_blkno, int rw,
401                              struct ocfs2_refcount_tree **ret_tree,
402                              struct buffer_head **ref_bh)
403 {
404         int ret, delete_tree = 0;
405         struct ocfs2_refcount_tree *tree = NULL;
406         struct buffer_head *ref_root_bh = NULL;
407         struct ocfs2_refcount_block *rb;
408
409 again:
410         ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
411         if (ret) {
412                 mlog_errno(ret);
413                 return ret;
414         }
415
416         ocfs2_refcount_tree_get(tree);
417
418         ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
419         if (ret) {
420                 mlog_errno(ret);
421                 ocfs2_refcount_tree_put(tree);
422                 goto out;
423         }
424
425         ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
426                                         &ref_root_bh);
427         if (ret) {
428                 mlog_errno(ret);
429                 ocfs2_unlock_refcount_tree(osb, tree, rw);
430                 ocfs2_refcount_tree_put(tree);
431                 goto out;
432         }
433
434         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
435         /*
436          * If the refcount block has been freed and re-created, we may need
437          * to recreate the refcount tree also.
438          *
439          * Here we just remove the tree from the rb-tree, and the last
440          * kref holder will unlock and delete this refcount_tree.
441          * Then we goto "again" and ocfs2_get_refcount_tree will create
442          * the new refcount tree for us.
443          */
444         if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
445                 if (!tree->rf_removed) {
446                         ocfs2_erase_refcount_tree_from_list(osb, tree);
447                         tree->rf_removed = 1;
448                         delete_tree = 1;
449                 }
450
451                 ocfs2_unlock_refcount_tree(osb, tree, rw);
452                 /*
453                  * We get an extra reference when we create the refcount
454                  * tree, so another put will destroy it.
455                  */
456                 if (delete_tree)
457                         ocfs2_refcount_tree_put(tree);
458                 brelse(ref_root_bh);
459                 ref_root_bh = NULL;
460                 goto again;
461         }
462
463         *ret_tree = tree;
464         if (ref_bh) {
465                 *ref_bh = ref_root_bh;
466                 ref_root_bh = NULL;
467         }
468 out:
469         brelse(ref_root_bh);
470         return ret;
471 }
472
473 int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
474                                       struct ocfs2_refcount_tree **ret_tree,
475                                       struct buffer_head **ref_bh)
476 {
477         int ret;
478         u64 ref_blkno;
479
480         ret = ocfs2_get_refcount_block(inode, &ref_blkno);
481         if (ret) {
482                 mlog_errno(ret);
483                 return ret;
484         }
485
486         return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
487                                         rw, ret_tree, ref_bh);
488 }
489
490 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
491                                 struct ocfs2_refcount_tree *tree, int rw)
492 {
493         if (rw)
494                 up_write(&tree->rf_sem);
495         else
496                 up_read(&tree->rf_sem);
497
498         ocfs2_refcount_unlock(tree, rw);
499         ocfs2_refcount_tree_put(tree);
500 }
501
502 void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
503 {
504         struct rb_node *node;
505         struct ocfs2_refcount_tree *tree;
506         struct rb_root *root = &osb->osb_rf_lock_tree;
507
508         while ((node = rb_last(root)) != NULL) {
509                 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
510
511                 mlog(0, "Purge tree %llu\n",
512                      (unsigned long long) tree->rf_blkno);
513
514                 rb_erase(&tree->rf_node, root);
515                 ocfs2_free_refcount_tree(tree);
516         }
517 }