ocfs2: clean up localalloc mount option size parsing
[safe/jmp/linux-2.6] / fs / ocfs2 / refcounttree.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * refcounttree.c
5  *
6  * Copyright (C) 2009 Oracle.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public
10  * License version 2 as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  */
17
18 #include <linux/sort.h>
19 #define MLOG_MASK_PREFIX ML_REFCOUNT
20 #include <cluster/masklog.h>
21 #include "ocfs2.h"
22 #include "inode.h"
23 #include "alloc.h"
24 #include "suballoc.h"
25 #include "journal.h"
26 #include "uptodate.h"
27 #include "super.h"
28 #include "buffer_head_io.h"
29 #include "blockcheck.h"
30 #include "refcounttree.h"
31 #include "sysfile.h"
32 #include "dlmglue.h"
33 #include "extent_map.h"
34 #include "aops.h"
35 #include "xattr.h"
36 #include "namei.h"
37
38 #include <linux/bio.h>
39 #include <linux/blkdev.h>
40 #include <linux/gfp.h>
41 #include <linux/slab.h>
42 #include <linux/writeback.h>
43 #include <linux/pagevec.h>
44 #include <linux/swap.h>
45 #include <linux/security.h>
46 #include <linux/fsnotify.h>
47 #include <linux/quotaops.h>
48 #include <linux/namei.h>
49 #include <linux/mount.h>
50
51 struct ocfs2_cow_context {
52         struct inode *inode;
53         u32 cow_start;
54         u32 cow_len;
55         struct ocfs2_extent_tree data_et;
56         struct ocfs2_refcount_tree *ref_tree;
57         struct buffer_head *ref_root_bh;
58         struct ocfs2_alloc_context *meta_ac;
59         struct ocfs2_alloc_context *data_ac;
60         struct ocfs2_cached_dealloc_ctxt dealloc;
61         void *cow_object;
62         struct ocfs2_post_refcount *post_refcount;
63         int extra_credits;
64         int (*get_clusters)(struct ocfs2_cow_context *context,
65                             u32 v_cluster, u32 *p_cluster,
66                             u32 *num_clusters,
67                             unsigned int *extent_flags);
68         int (*cow_duplicate_clusters)(handle_t *handle,
69                                       struct ocfs2_cow_context *context,
70                                       u32 cpos, u32 old_cluster,
71                                       u32 new_cluster, u32 new_len);
72 };
73
74 static inline struct ocfs2_refcount_tree *
75 cache_info_to_refcount(struct ocfs2_caching_info *ci)
76 {
77         return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
78 }
79
80 static int ocfs2_validate_refcount_block(struct super_block *sb,
81                                          struct buffer_head *bh)
82 {
83         int rc;
84         struct ocfs2_refcount_block *rb =
85                 (struct ocfs2_refcount_block *)bh->b_data;
86
87         mlog(0, "Validating refcount block %llu\n",
88              (unsigned long long)bh->b_blocknr);
89
90         BUG_ON(!buffer_uptodate(bh));
91
92         /*
93          * If the ecc fails, we return the error but otherwise
94          * leave the filesystem running.  We know any error is
95          * local to this block.
96          */
97         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
98         if (rc) {
99                 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
100                      (unsigned long long)bh->b_blocknr);
101                 return rc;
102         }
103
104
105         if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
106                 ocfs2_error(sb,
107                             "Refcount block #%llu has bad signature %.*s",
108                             (unsigned long long)bh->b_blocknr, 7,
109                             rb->rf_signature);
110                 return -EINVAL;
111         }
112
113         if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
114                 ocfs2_error(sb,
115                             "Refcount block #%llu has an invalid rf_blkno "
116                             "of %llu",
117                             (unsigned long long)bh->b_blocknr,
118                             (unsigned long long)le64_to_cpu(rb->rf_blkno));
119                 return -EINVAL;
120         }
121
122         if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
123                 ocfs2_error(sb,
124                             "Refcount block #%llu has an invalid "
125                             "rf_fs_generation of #%u",
126                             (unsigned long long)bh->b_blocknr,
127                             le32_to_cpu(rb->rf_fs_generation));
128                 return -EINVAL;
129         }
130
131         return 0;
132 }
133
134 static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
135                                      u64 rb_blkno,
136                                      struct buffer_head **bh)
137 {
138         int rc;
139         struct buffer_head *tmp = *bh;
140
141         rc = ocfs2_read_block(ci, rb_blkno, &tmp,
142                               ocfs2_validate_refcount_block);
143
144         /* If ocfs2_read_block() got us a new bh, pass it up. */
145         if (!rc && !*bh)
146                 *bh = tmp;
147
148         return rc;
149 }
150
151 static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
152 {
153         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
154
155         return rf->rf_blkno;
156 }
157
158 static struct super_block *
159 ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
160 {
161         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
162
163         return rf->rf_sb;
164 }
165
166 static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
167 {
168         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
169
170         spin_lock(&rf->rf_lock);
171 }
172
173 static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
174 {
175         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
176
177         spin_unlock(&rf->rf_lock);
178 }
179
180 static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
181 {
182         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
183
184         mutex_lock(&rf->rf_io_mutex);
185 }
186
187 static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
188 {
189         struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
190
191         mutex_unlock(&rf->rf_io_mutex);
192 }
193
194 static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
195         .co_owner               = ocfs2_refcount_cache_owner,
196         .co_get_super           = ocfs2_refcount_cache_get_super,
197         .co_cache_lock          = ocfs2_refcount_cache_lock,
198         .co_cache_unlock        = ocfs2_refcount_cache_unlock,
199         .co_io_lock             = ocfs2_refcount_cache_io_lock,
200         .co_io_unlock           = ocfs2_refcount_cache_io_unlock,
201 };
202
203 static struct ocfs2_refcount_tree *
204 ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
205 {
206         struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
207         struct ocfs2_refcount_tree *tree = NULL;
208
209         while (n) {
210                 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
211
212                 if (blkno < tree->rf_blkno)
213                         n = n->rb_left;
214                 else if (blkno > tree->rf_blkno)
215                         n = n->rb_right;
216                 else
217                         return tree;
218         }
219
220         return NULL;
221 }
222
223 /* osb_lock is already locked. */
224 static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
225                                        struct ocfs2_refcount_tree *new)
226 {
227         u64 rf_blkno = new->rf_blkno;
228         struct rb_node *parent = NULL;
229         struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
230         struct ocfs2_refcount_tree *tmp;
231
232         while (*p) {
233                 parent = *p;
234
235                 tmp = rb_entry(parent, struct ocfs2_refcount_tree,
236                                rf_node);
237
238                 if (rf_blkno < tmp->rf_blkno)
239                         p = &(*p)->rb_left;
240                 else if (rf_blkno > tmp->rf_blkno)
241                         p = &(*p)->rb_right;
242                 else {
243                         /* This should never happen! */
244                         mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
245                              (unsigned long long)rf_blkno);
246                         BUG();
247                 }
248         }
249
250         rb_link_node(&new->rf_node, parent, p);
251         rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
252 }
253
254 static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
255 {
256         ocfs2_metadata_cache_exit(&tree->rf_ci);
257         ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
258         ocfs2_lock_res_free(&tree->rf_lockres);
259         kfree(tree);
260 }
261
262 static inline void
263 ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
264                                         struct ocfs2_refcount_tree *tree)
265 {
266         rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
267         if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
268                 osb->osb_ref_tree_lru = NULL;
269 }
270
271 static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
272                                         struct ocfs2_refcount_tree *tree)
273 {
274         spin_lock(&osb->osb_lock);
275         ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
276         spin_unlock(&osb->osb_lock);
277 }
278
279 static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
280 {
281         struct ocfs2_refcount_tree *tree =
282                 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
283
284         ocfs2_free_refcount_tree(tree);
285 }
286
287 static inline void
288 ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
289 {
290         kref_get(&tree->rf_getcnt);
291 }
292
293 static inline void
294 ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
295 {
296         kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
297 }
298
299 static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
300                                                struct super_block *sb)
301 {
302         ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
303         mutex_init(&new->rf_io_mutex);
304         new->rf_sb = sb;
305         spin_lock_init(&new->rf_lock);
306 }
307
308 static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
309                                         struct ocfs2_refcount_tree *new,
310                                         u64 rf_blkno, u32 generation)
311 {
312         init_rwsem(&new->rf_sem);
313         ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
314                                      rf_blkno, generation);
315 }
316
317 static struct ocfs2_refcount_tree*
318 ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
319 {
320         struct ocfs2_refcount_tree *new;
321
322         new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
323         if (!new)
324                 return NULL;
325
326         new->rf_blkno = rf_blkno;
327         kref_init(&new->rf_getcnt);
328         ocfs2_init_refcount_tree_ci(new, osb->sb);
329
330         return new;
331 }
332
333 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
334                                    struct ocfs2_refcount_tree **ret_tree)
335 {
336         int ret = 0;
337         struct ocfs2_refcount_tree *tree, *new = NULL;
338         struct buffer_head *ref_root_bh = NULL;
339         struct ocfs2_refcount_block *ref_rb;
340
341         spin_lock(&osb->osb_lock);
342         if (osb->osb_ref_tree_lru &&
343             osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
344                 tree = osb->osb_ref_tree_lru;
345         else
346                 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
347         if (tree)
348                 goto out;
349
350         spin_unlock(&osb->osb_lock);
351
352         new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
353         if (!new) {
354                 ret = -ENOMEM;
355                 mlog_errno(ret);
356                 return ret;
357         }
358         /*
359          * We need the generation to create the refcount tree lock and since
360          * it isn't changed during the tree modification, we are safe here to
361          * read without protection.
362          * We also have to purge the cache after we create the lock since the
363          * refcount block may have the stale data. It can only be trusted when
364          * we hold the refcount lock.
365          */
366         ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
367         if (ret) {
368                 mlog_errno(ret);
369                 ocfs2_metadata_cache_exit(&new->rf_ci);
370                 kfree(new);
371                 return ret;
372         }
373
374         ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
375         new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
376         ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
377                                       new->rf_generation);
378         ocfs2_metadata_cache_purge(&new->rf_ci);
379
380         spin_lock(&osb->osb_lock);
381         tree = ocfs2_find_refcount_tree(osb, rf_blkno);
382         if (tree)
383                 goto out;
384
385         ocfs2_insert_refcount_tree(osb, new);
386
387         tree = new;
388         new = NULL;
389
390 out:
391         *ret_tree = tree;
392
393         osb->osb_ref_tree_lru = tree;
394
395         spin_unlock(&osb->osb_lock);
396
397         if (new)
398                 ocfs2_free_refcount_tree(new);
399
400         brelse(ref_root_bh);
401         return ret;
402 }
403
404 static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
405 {
406         int ret;
407         struct buffer_head *di_bh = NULL;
408         struct ocfs2_dinode *di;
409
410         ret = ocfs2_read_inode_block(inode, &di_bh);
411         if (ret) {
412                 mlog_errno(ret);
413                 goto out;
414         }
415
416         BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
417
418         di = (struct ocfs2_dinode *)di_bh->b_data;
419         *ref_blkno = le64_to_cpu(di->i_refcount_loc);
420         brelse(di_bh);
421 out:
422         return ret;
423 }
424
425 static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
426                                       struct ocfs2_refcount_tree *tree, int rw)
427 {
428         int ret;
429
430         ret = ocfs2_refcount_lock(tree, rw);
431         if (ret) {
432                 mlog_errno(ret);
433                 goto out;
434         }
435
436         if (rw)
437                 down_write(&tree->rf_sem);
438         else
439                 down_read(&tree->rf_sem);
440
441 out:
442         return ret;
443 }
444
445 /*
446  * Lock the refcount tree pointed by ref_blkno and return the tree.
447  * In most case, we lock the tree and read the refcount block.
448  * So read it here if the caller really needs it.
449  *
450  * If the tree has been re-created by other node, it will free the
451  * old one and re-create it.
452  */
453 int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
454                              u64 ref_blkno, int rw,
455                              struct ocfs2_refcount_tree **ret_tree,
456                              struct buffer_head **ref_bh)
457 {
458         int ret, delete_tree = 0;
459         struct ocfs2_refcount_tree *tree = NULL;
460         struct buffer_head *ref_root_bh = NULL;
461         struct ocfs2_refcount_block *rb;
462
463 again:
464         ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
465         if (ret) {
466                 mlog_errno(ret);
467                 return ret;
468         }
469
470         ocfs2_refcount_tree_get(tree);
471
472         ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
473         if (ret) {
474                 mlog_errno(ret);
475                 ocfs2_refcount_tree_put(tree);
476                 goto out;
477         }
478
479         ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
480                                         &ref_root_bh);
481         if (ret) {
482                 mlog_errno(ret);
483                 ocfs2_unlock_refcount_tree(osb, tree, rw);
484                 ocfs2_refcount_tree_put(tree);
485                 goto out;
486         }
487
488         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
489         /*
490          * If the refcount block has been freed and re-created, we may need
491          * to recreate the refcount tree also.
492          *
493          * Here we just remove the tree from the rb-tree, and the last
494          * kref holder will unlock and delete this refcount_tree.
495          * Then we goto "again" and ocfs2_get_refcount_tree will create
496          * the new refcount tree for us.
497          */
498         if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
499                 if (!tree->rf_removed) {
500                         ocfs2_erase_refcount_tree_from_list(osb, tree);
501                         tree->rf_removed = 1;
502                         delete_tree = 1;
503                 }
504
505                 ocfs2_unlock_refcount_tree(osb, tree, rw);
506                 /*
507                  * We get an extra reference when we create the refcount
508                  * tree, so another put will destroy it.
509                  */
510                 if (delete_tree)
511                         ocfs2_refcount_tree_put(tree);
512                 brelse(ref_root_bh);
513                 ref_root_bh = NULL;
514                 goto again;
515         }
516
517         *ret_tree = tree;
518         if (ref_bh) {
519                 *ref_bh = ref_root_bh;
520                 ref_root_bh = NULL;
521         }
522 out:
523         brelse(ref_root_bh);
524         return ret;
525 }
526
527 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
528                                 struct ocfs2_refcount_tree *tree, int rw)
529 {
530         if (rw)
531                 up_write(&tree->rf_sem);
532         else
533                 up_read(&tree->rf_sem);
534
535         ocfs2_refcount_unlock(tree, rw);
536         ocfs2_refcount_tree_put(tree);
537 }
538
539 void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
540 {
541         struct rb_node *node;
542         struct ocfs2_refcount_tree *tree;
543         struct rb_root *root = &osb->osb_rf_lock_tree;
544
545         while ((node = rb_last(root)) != NULL) {
546                 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
547
548                 mlog(0, "Purge tree %llu\n",
549                      (unsigned long long) tree->rf_blkno);
550
551                 rb_erase(&tree->rf_node, root);
552                 ocfs2_free_refcount_tree(tree);
553         }
554 }
555
556 /*
557  * Create a refcount tree for an inode.
558  * We take for granted that the inode is already locked.
559  */
560 static int ocfs2_create_refcount_tree(struct inode *inode,
561                                       struct buffer_head *di_bh)
562 {
563         int ret;
564         handle_t *handle = NULL;
565         struct ocfs2_alloc_context *meta_ac = NULL;
566         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
567         struct ocfs2_inode_info *oi = OCFS2_I(inode);
568         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
569         struct buffer_head *new_bh = NULL;
570         struct ocfs2_refcount_block *rb;
571         struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
572         u16 suballoc_bit_start;
573         u32 num_got;
574         u64 first_blkno;
575
576         BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
577
578         mlog(0, "create tree for inode %lu\n", inode->i_ino);
579
580         ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
581         if (ret) {
582                 mlog_errno(ret);
583                 goto out;
584         }
585
586         handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
587         if (IS_ERR(handle)) {
588                 ret = PTR_ERR(handle);
589                 mlog_errno(ret);
590                 goto out;
591         }
592
593         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
594                                       OCFS2_JOURNAL_ACCESS_WRITE);
595         if (ret) {
596                 mlog_errno(ret);
597                 goto out_commit;
598         }
599
600         ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
601                                    &suballoc_bit_start, &num_got,
602                                    &first_blkno);
603         if (ret) {
604                 mlog_errno(ret);
605                 goto out_commit;
606         }
607
608         new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
609         if (!new_tree) {
610                 ret = -ENOMEM;
611                 mlog_errno(ret);
612                 goto out_commit;
613         }
614
615         new_bh = sb_getblk(inode->i_sb, first_blkno);
616         ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
617
618         ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
619                                       OCFS2_JOURNAL_ACCESS_CREATE);
620         if (ret) {
621                 mlog_errno(ret);
622                 goto out_commit;
623         }
624
625         /* Initialize ocfs2_refcount_block. */
626         rb = (struct ocfs2_refcount_block *)new_bh->b_data;
627         memset(rb, 0, inode->i_sb->s_blocksize);
628         strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629         rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
630         rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631         rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632         rb->rf_blkno = cpu_to_le64(first_blkno);
633         rb->rf_count = cpu_to_le32(1);
634         rb->rf_records.rl_count =
635                         cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
636         spin_lock(&osb->osb_lock);
637         rb->rf_generation = osb->s_next_generation++;
638         spin_unlock(&osb->osb_lock);
639
640         ocfs2_journal_dirty(handle, new_bh);
641
642         spin_lock(&oi->ip_lock);
643         oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
644         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
645         di->i_refcount_loc = cpu_to_le64(first_blkno);
646         spin_unlock(&oi->ip_lock);
647
648         mlog(0, "created tree for inode %lu, refblock %llu\n",
649              inode->i_ino, (unsigned long long)first_blkno);
650
651         ocfs2_journal_dirty(handle, di_bh);
652
653         /*
654          * We have to init the tree lock here since it will use
655          * the generation number to create it.
656          */
657         new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
658         ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
659                                       new_tree->rf_generation);
660
661         spin_lock(&osb->osb_lock);
662         tree = ocfs2_find_refcount_tree(osb, first_blkno);
663
664         /*
665          * We've just created a new refcount tree in this block.  If
666          * we found a refcount tree on the ocfs2_super, it must be
667          * one we just deleted.  We free the old tree before
668          * inserting the new tree.
669          */
670         BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
671         if (tree)
672                 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
673         ocfs2_insert_refcount_tree(osb, new_tree);
674         spin_unlock(&osb->osb_lock);
675         new_tree = NULL;
676         if (tree)
677                 ocfs2_refcount_tree_put(tree);
678
679 out_commit:
680         ocfs2_commit_trans(osb, handle);
681
682 out:
683         if (new_tree) {
684                 ocfs2_metadata_cache_exit(&new_tree->rf_ci);
685                 kfree(new_tree);
686         }
687
688         brelse(new_bh);
689         if (meta_ac)
690                 ocfs2_free_alloc_context(meta_ac);
691
692         return ret;
693 }
694
695 static int ocfs2_set_refcount_tree(struct inode *inode,
696                                    struct buffer_head *di_bh,
697                                    u64 refcount_loc)
698 {
699         int ret;
700         handle_t *handle = NULL;
701         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
702         struct ocfs2_inode_info *oi = OCFS2_I(inode);
703         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
704         struct buffer_head *ref_root_bh = NULL;
705         struct ocfs2_refcount_block *rb;
706         struct ocfs2_refcount_tree *ref_tree;
707
708         BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
709
710         ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
711                                        &ref_tree, &ref_root_bh);
712         if (ret) {
713                 mlog_errno(ret);
714                 return ret;
715         }
716
717         handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
718         if (IS_ERR(handle)) {
719                 ret = PTR_ERR(handle);
720                 mlog_errno(ret);
721                 goto out;
722         }
723
724         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
725                                       OCFS2_JOURNAL_ACCESS_WRITE);
726         if (ret) {
727                 mlog_errno(ret);
728                 goto out_commit;
729         }
730
731         ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
732                                       OCFS2_JOURNAL_ACCESS_WRITE);
733         if (ret) {
734                 mlog_errno(ret);
735                 goto out_commit;
736         }
737
738         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
739         le32_add_cpu(&rb->rf_count, 1);
740
741         ocfs2_journal_dirty(handle, ref_root_bh);
742
743         spin_lock(&oi->ip_lock);
744         oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
745         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
746         di->i_refcount_loc = cpu_to_le64(refcount_loc);
747         spin_unlock(&oi->ip_lock);
748         ocfs2_journal_dirty(handle, di_bh);
749
750 out_commit:
751         ocfs2_commit_trans(osb, handle);
752 out:
753         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
754         brelse(ref_root_bh);
755
756         return ret;
757 }
758
759 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
760 {
761         int ret, delete_tree = 0;
762         handle_t *handle = NULL;
763         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
764         struct ocfs2_inode_info *oi = OCFS2_I(inode);
765         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
766         struct ocfs2_refcount_block *rb;
767         struct inode *alloc_inode = NULL;
768         struct buffer_head *alloc_bh = NULL;
769         struct buffer_head *blk_bh = NULL;
770         struct ocfs2_refcount_tree *ref_tree;
771         int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
772         u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
773         u16 bit = 0;
774
775         if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
776                 return 0;
777
778         BUG_ON(!ref_blkno);
779         ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
780         if (ret) {
781                 mlog_errno(ret);
782                 return ret;
783         }
784
785         rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
786
787         /*
788          * If we are the last user, we need to free the block.
789          * So lock the allocator ahead.
790          */
791         if (le32_to_cpu(rb->rf_count) == 1) {
792                 blk = le64_to_cpu(rb->rf_blkno);
793                 bit = le16_to_cpu(rb->rf_suballoc_bit);
794                 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
795
796                 alloc_inode = ocfs2_get_system_file_inode(osb,
797                                         EXTENT_ALLOC_SYSTEM_INODE,
798                                         le16_to_cpu(rb->rf_suballoc_slot));
799                 if (!alloc_inode) {
800                         ret = -ENOMEM;
801                         mlog_errno(ret);
802                         goto out;
803                 }
804                 mutex_lock(&alloc_inode->i_mutex);
805
806                 ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
807                 if (ret) {
808                         mlog_errno(ret);
809                         goto out_mutex;
810                 }
811
812                 credits += OCFS2_SUBALLOC_FREE;
813         }
814
815         handle = ocfs2_start_trans(osb, credits);
816         if (IS_ERR(handle)) {
817                 ret = PTR_ERR(handle);
818                 mlog_errno(ret);
819                 goto out_unlock;
820         }
821
822         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
823                                       OCFS2_JOURNAL_ACCESS_WRITE);
824         if (ret) {
825                 mlog_errno(ret);
826                 goto out_commit;
827         }
828
829         ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
830                                       OCFS2_JOURNAL_ACCESS_WRITE);
831         if (ret) {
832                 mlog_errno(ret);
833                 goto out_commit;
834         }
835
836         spin_lock(&oi->ip_lock);
837         oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
838         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
839         di->i_refcount_loc = 0;
840         spin_unlock(&oi->ip_lock);
841         ocfs2_journal_dirty(handle, di_bh);
842
843         le32_add_cpu(&rb->rf_count , -1);
844         ocfs2_journal_dirty(handle, blk_bh);
845
846         if (!rb->rf_count) {
847                 delete_tree = 1;
848                 ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
849                 ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
850                                                alloc_bh, bit, bg_blkno, 1);
851                 if (ret)
852                         mlog_errno(ret);
853         }
854
855 out_commit:
856         ocfs2_commit_trans(osb, handle);
857 out_unlock:
858         if (alloc_inode) {
859                 ocfs2_inode_unlock(alloc_inode, 1);
860                 brelse(alloc_bh);
861         }
862 out_mutex:
863         if (alloc_inode) {
864                 mutex_unlock(&alloc_inode->i_mutex);
865                 iput(alloc_inode);
866         }
867 out:
868         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
869         if (delete_tree)
870                 ocfs2_refcount_tree_put(ref_tree);
871         brelse(blk_bh);
872
873         return ret;
874 }
875
876 static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
877                                           struct buffer_head *ref_leaf_bh,
878                                           u64 cpos, unsigned int len,
879                                           struct ocfs2_refcount_rec *ret_rec,
880                                           int *index)
881 {
882         int i = 0;
883         struct ocfs2_refcount_block *rb =
884                 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
885         struct ocfs2_refcount_rec *rec = NULL;
886
887         for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
888                 rec = &rb->rf_records.rl_recs[i];
889
890                 if (le64_to_cpu(rec->r_cpos) +
891                     le32_to_cpu(rec->r_clusters) <= cpos)
892                         continue;
893                 else if (le64_to_cpu(rec->r_cpos) > cpos)
894                         break;
895
896                 /* ok, cpos fail in this rec. Just return. */
897                 if (ret_rec)
898                         *ret_rec = *rec;
899                 goto out;
900         }
901
902         if (ret_rec) {
903                 /* We meet with a hole here, so fake the rec. */
904                 ret_rec->r_cpos = cpu_to_le64(cpos);
905                 ret_rec->r_refcount = 0;
906                 if (i < le16_to_cpu(rb->rf_records.rl_used) &&
907                     le64_to_cpu(rec->r_cpos) < cpos + len)
908                         ret_rec->r_clusters =
909                                 cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
910                 else
911                         ret_rec->r_clusters = cpu_to_le32(len);
912         }
913
914 out:
915         *index = i;
916 }
917
918 /*
919  * Try to remove refcount tree. The mechanism is:
920  * 1) Check whether i_clusters == 0, if no, exit.
921  * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
922  * 3) Check whether we have inline xattr stored outside, if yes, exit.
923  * 4) Remove the tree.
924  */
925 int ocfs2_try_remove_refcount_tree(struct inode *inode,
926                                    struct buffer_head *di_bh)
927 {
928         int ret;
929         struct ocfs2_inode_info *oi = OCFS2_I(inode);
930         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
931
932         down_write(&oi->ip_xattr_sem);
933         down_write(&oi->ip_alloc_sem);
934
935         if (oi->ip_clusters)
936                 goto out;
937
938         if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
939                 goto out;
940
941         if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
942             ocfs2_has_inline_xattr_value_outside(inode, di))
943                 goto out;
944
945         ret = ocfs2_remove_refcount_tree(inode, di_bh);
946         if (ret)
947                 mlog_errno(ret);
948 out:
949         up_write(&oi->ip_alloc_sem);
950         up_write(&oi->ip_xattr_sem);
951         return 0;
952 }
953
954 /*
955  * Find the end range for a leaf refcount block indicated by
956  * el->l_recs[index].e_blkno.
957  */
958 static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
959                                        struct buffer_head *ref_root_bh,
960                                        struct ocfs2_extent_block *eb,
961                                        struct ocfs2_extent_list *el,
962                                        int index,  u32 *cpos_end)
963 {
964         int ret, i, subtree_root;
965         u32 cpos;
966         u64 blkno;
967         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
968         struct ocfs2_path *left_path = NULL, *right_path = NULL;
969         struct ocfs2_extent_tree et;
970         struct ocfs2_extent_list *tmp_el;
971
972         if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
973                 /*
974                  * We have a extent rec after index, so just use the e_cpos
975                  * of the next extent rec.
976                  */
977                 *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
978                 return 0;
979         }
980
981         if (!eb || (eb && !eb->h_next_leaf_blk)) {
982                 /*
983                  * We are the last extent rec, so any high cpos should
984                  * be stored in this leaf refcount block.
985                  */
986                 *cpos_end = UINT_MAX;
987                 return 0;
988         }
989
990         /*
991          * If the extent block isn't the last one, we have to find
992          * the subtree root between this extent block and the next
993          * leaf extent block and get the corresponding e_cpos from
994          * the subroot. Otherwise we may corrupt the b-tree.
995          */
996         ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
997
998         left_path = ocfs2_new_path_from_et(&et);
999         if (!left_path) {
1000                 ret = -ENOMEM;
1001                 mlog_errno(ret);
1002                 goto out;
1003         }
1004
1005         cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1006         ret = ocfs2_find_path(ci, left_path, cpos);
1007         if (ret) {
1008                 mlog_errno(ret);
1009                 goto out;
1010         }
1011
1012         right_path = ocfs2_new_path_from_path(left_path);
1013         if (!right_path) {
1014                 ret = -ENOMEM;
1015                 mlog_errno(ret);
1016                 goto out;
1017         }
1018
1019         ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1020         if (ret) {
1021                 mlog_errno(ret);
1022                 goto out;
1023         }
1024
1025         ret = ocfs2_find_path(ci, right_path, cpos);
1026         if (ret) {
1027                 mlog_errno(ret);
1028                 goto out;
1029         }
1030
1031         subtree_root = ocfs2_find_subtree_root(&et, left_path,
1032                                                right_path);
1033
1034         tmp_el = left_path->p_node[subtree_root].el;
1035         blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1036         for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
1037                 if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1038                         *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1039                         break;
1040                 }
1041         }
1042
1043         BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
1044
1045 out:
1046         ocfs2_free_path(left_path);
1047         ocfs2_free_path(right_path);
1048         return ret;
1049 }
1050
1051 /*
1052  * Given a cpos and len, try to find the refcount record which contains cpos.
1053  * 1. If cpos can be found in one refcount record, return the record.
1054  * 2. If cpos can't be found, return a fake record which start from cpos
1055  *    and end at a small value between cpos+len and start of the next record.
1056  *    This fake record has r_refcount = 0.
1057  */
1058 static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1059                                   struct buffer_head *ref_root_bh,
1060                                   u64 cpos, unsigned int len,
1061                                   struct ocfs2_refcount_rec *ret_rec,
1062                                   int *index,
1063                                   struct buffer_head **ret_bh)
1064 {
1065         int ret = 0, i, found;
1066         u32 low_cpos, uninitialized_var(cpos_end);
1067         struct ocfs2_extent_list *el;
1068         struct ocfs2_extent_rec *rec = NULL;
1069         struct ocfs2_extent_block *eb = NULL;
1070         struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
1071         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1072         struct ocfs2_refcount_block *rb =
1073                         (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1074
1075         if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
1076                 ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
1077                                               ret_rec, index);
1078                 *ret_bh = ref_root_bh;
1079                 get_bh(ref_root_bh);
1080                 return 0;
1081         }
1082
1083         el = &rb->rf_list;
1084         low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1085
1086         if (el->l_tree_depth) {
1087                 ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1088                 if (ret) {
1089                         mlog_errno(ret);
1090                         goto out;
1091                 }
1092
1093                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1094                 el = &eb->h_list;
1095
1096                 if (el->l_tree_depth) {
1097                         ocfs2_error(sb,
1098                         "refcount tree %llu has non zero tree "
1099                         "depth in leaf btree tree block %llu\n",
1100                         (unsigned long long)ocfs2_metadata_cache_owner(ci),
1101                         (unsigned long long)eb_bh->b_blocknr);
1102                         ret = -EROFS;
1103                         goto out;
1104                 }
1105         }
1106
1107         found = 0;
1108         for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1109                 rec = &el->l_recs[i];
1110
1111                 if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1112                         found = 1;
1113                         break;
1114                 }
1115         }
1116
1117         if (found) {
1118                 ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1119                                                   eb, el, i, &cpos_end);
1120                 if (ret) {
1121                         mlog_errno(ret);
1122                         goto out;
1123                 }
1124
1125                 if (cpos_end < low_cpos + len)
1126                         len = cpos_end - low_cpos;
1127         }
1128
1129         ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1130                                         &ref_leaf_bh);
1131         if (ret) {
1132                 mlog_errno(ret);
1133                 goto out;
1134         }
1135
1136         ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1137                                       ret_rec, index);
1138         *ret_bh = ref_leaf_bh;
1139 out:
1140         brelse(eb_bh);
1141         return ret;
1142 }
1143
1144 enum ocfs2_ref_rec_contig {
1145         REF_CONTIG_NONE = 0,
1146         REF_CONTIG_LEFT,
1147         REF_CONTIG_RIGHT,
1148         REF_CONTIG_LEFTRIGHT,
1149 };
1150
1151 static enum ocfs2_ref_rec_contig
1152         ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1153                                     int index)
1154 {
1155         if ((rb->rf_records.rl_recs[index].r_refcount ==
1156             rb->rf_records.rl_recs[index + 1].r_refcount) &&
1157             (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1158             le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1159             le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1160                 return REF_CONTIG_RIGHT;
1161
1162         return REF_CONTIG_NONE;
1163 }
1164
1165 static enum ocfs2_ref_rec_contig
1166         ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1167                                   int index)
1168 {
1169         enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1170
1171         if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1172                 ret = ocfs2_refcount_rec_adjacent(rb, index);
1173
1174         if (index > 0) {
1175                 enum ocfs2_ref_rec_contig tmp;
1176
1177                 tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1178
1179                 if (tmp == REF_CONTIG_RIGHT) {
1180                         if (ret == REF_CONTIG_RIGHT)
1181                                 ret = REF_CONTIG_LEFTRIGHT;
1182                         else
1183                                 ret = REF_CONTIG_LEFT;
1184                 }
1185         }
1186
1187         return ret;
1188 }
1189
1190 static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1191                                            int index)
1192 {
1193         BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1194                rb->rf_records.rl_recs[index+1].r_refcount);
1195
1196         le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1197                      le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1198
1199         if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1200                 memmove(&rb->rf_records.rl_recs[index + 1],
1201                         &rb->rf_records.rl_recs[index + 2],
1202                         sizeof(struct ocfs2_refcount_rec) *
1203                         (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1204
1205         memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1206                0, sizeof(struct ocfs2_refcount_rec));
1207         le16_add_cpu(&rb->rf_records.rl_used, -1);
1208 }
1209
1210 /*
1211  * Merge the refcount rec if we are contiguous with the adjacent recs.
1212  */
1213 static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1214                                      int index)
1215 {
1216         enum ocfs2_ref_rec_contig contig =
1217                                 ocfs2_refcount_rec_contig(rb, index);
1218
1219         if (contig == REF_CONTIG_NONE)
1220                 return;
1221
1222         if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1223                 BUG_ON(index == 0);
1224                 index--;
1225         }
1226
1227         ocfs2_rotate_refcount_rec_left(rb, index);
1228
1229         if (contig == REF_CONTIG_LEFTRIGHT)
1230                 ocfs2_rotate_refcount_rec_left(rb, index);
1231 }
1232
1233 /*
1234  * Change the refcount indexed by "index" in ref_bh.
1235  * If refcount reaches 0, remove it.
1236  */
1237 static int ocfs2_change_refcount_rec(handle_t *handle,
1238                                      struct ocfs2_caching_info *ci,
1239                                      struct buffer_head *ref_leaf_bh,
1240                                      int index, int merge, int change)
1241 {
1242         int ret;
1243         struct ocfs2_refcount_block *rb =
1244                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1245         struct ocfs2_refcount_list *rl = &rb->rf_records;
1246         struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1247
1248         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1249                                       OCFS2_JOURNAL_ACCESS_WRITE);
1250         if (ret) {
1251                 mlog_errno(ret);
1252                 goto out;
1253         }
1254
1255         mlog(0, "change index %d, old count %u, change %d\n", index,
1256              le32_to_cpu(rec->r_refcount), change);
1257         le32_add_cpu(&rec->r_refcount, change);
1258
1259         if (!rec->r_refcount) {
1260                 if (index != le16_to_cpu(rl->rl_used) - 1) {
1261                         memmove(rec, rec + 1,
1262                                 (le16_to_cpu(rl->rl_used) - index - 1) *
1263                                 sizeof(struct ocfs2_refcount_rec));
1264                         memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1265                                0, sizeof(struct ocfs2_refcount_rec));
1266                 }
1267
1268                 le16_add_cpu(&rl->rl_used, -1);
1269         } else if (merge)
1270                 ocfs2_refcount_rec_merge(rb, index);
1271
1272         ocfs2_journal_dirty(handle, ref_leaf_bh);
1273 out:
1274         return ret;
1275 }
1276
1277 static int ocfs2_expand_inline_ref_root(handle_t *handle,
1278                                         struct ocfs2_caching_info *ci,
1279                                         struct buffer_head *ref_root_bh,
1280                                         struct buffer_head **ref_leaf_bh,
1281                                         struct ocfs2_alloc_context *meta_ac)
1282 {
1283         int ret;
1284         u16 suballoc_bit_start;
1285         u32 num_got;
1286         u64 blkno;
1287         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1288         struct buffer_head *new_bh = NULL;
1289         struct ocfs2_refcount_block *new_rb;
1290         struct ocfs2_refcount_block *root_rb =
1291                         (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1292
1293         ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1294                                       OCFS2_JOURNAL_ACCESS_WRITE);
1295         if (ret) {
1296                 mlog_errno(ret);
1297                 goto out;
1298         }
1299
1300         ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1301                                    &suballoc_bit_start, &num_got,
1302                                    &blkno);
1303         if (ret) {
1304                 mlog_errno(ret);
1305                 goto out;
1306         }
1307
1308         new_bh = sb_getblk(sb, blkno);
1309         if (new_bh == NULL) {
1310                 ret = -EIO;
1311                 mlog_errno(ret);
1312                 goto out;
1313         }
1314         ocfs2_set_new_buffer_uptodate(ci, new_bh);
1315
1316         ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1317                                       OCFS2_JOURNAL_ACCESS_CREATE);
1318         if (ret) {
1319                 mlog_errno(ret);
1320                 goto out;
1321         }
1322
1323         /*
1324          * Initialize ocfs2_refcount_block.
1325          * It should contain the same information as the old root.
1326          * so just memcpy it and change the corresponding field.
1327          */
1328         memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1329
1330         new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1331         new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1332         new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1333         new_rb->rf_blkno = cpu_to_le64(blkno);
1334         new_rb->rf_cpos = cpu_to_le32(0);
1335         new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1336         new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1337         ocfs2_journal_dirty(handle, new_bh);
1338
1339         /* Now change the root. */
1340         memset(&root_rb->rf_list, 0, sb->s_blocksize -
1341                offsetof(struct ocfs2_refcount_block, rf_list));
1342         root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1343         root_rb->rf_clusters = cpu_to_le32(1);
1344         root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1345         root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1346         root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1347         root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1348
1349         ocfs2_journal_dirty(handle, ref_root_bh);
1350
1351         mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
1352              le16_to_cpu(new_rb->rf_records.rl_used));
1353
1354         *ref_leaf_bh = new_bh;
1355         new_bh = NULL;
1356 out:
1357         brelse(new_bh);
1358         return ret;
1359 }
1360
1361 static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1362                                            struct ocfs2_refcount_rec *next)
1363 {
1364         if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1365                 ocfs2_get_ref_rec_low_cpos(next))
1366                 return 1;
1367
1368         return 0;
1369 }
1370
1371 static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1372 {
1373         const struct ocfs2_refcount_rec *l = a, *r = b;
1374         u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1375         u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1376
1377         if (l_cpos > r_cpos)
1378                 return 1;
1379         if (l_cpos < r_cpos)
1380                 return -1;
1381         return 0;
1382 }
1383
1384 static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1385 {
1386         const struct ocfs2_refcount_rec *l = a, *r = b;
1387         u64 l_cpos = le64_to_cpu(l->r_cpos);
1388         u64 r_cpos = le64_to_cpu(r->r_cpos);
1389
1390         if (l_cpos > r_cpos)
1391                 return 1;
1392         if (l_cpos < r_cpos)
1393                 return -1;
1394         return 0;
1395 }
1396
1397 static void swap_refcount_rec(void *a, void *b, int size)
1398 {
1399         struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1400
1401         tmp = *(struct ocfs2_refcount_rec *)l;
1402         *(struct ocfs2_refcount_rec *)l =
1403                         *(struct ocfs2_refcount_rec *)r;
1404         *(struct ocfs2_refcount_rec *)r = tmp;
1405 }
1406
1407 /*
1408  * The refcount cpos are ordered by their 64bit cpos,
1409  * But we will use the low 32 bit to be the e_cpos in the b-tree.
1410  * So we need to make sure that this pos isn't intersected with others.
1411  *
1412  * Note: The refcount block is already sorted by their low 32 bit cpos,
1413  *       So just try the middle pos first, and we will exit when we find
1414  *       the good position.
1415  */
1416 static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1417                                          u32 *split_pos, int *split_index)
1418 {
1419         int num_used = le16_to_cpu(rl->rl_used);
1420         int delta, middle = num_used / 2;
1421
1422         for (delta = 0; delta < middle; delta++) {
1423                 /* Let's check delta earlier than middle */
1424                 if (ocfs2_refcount_rec_no_intersect(
1425                                         &rl->rl_recs[middle - delta - 1],
1426                                         &rl->rl_recs[middle - delta])) {
1427                         *split_index = middle - delta;
1428                         break;
1429                 }
1430
1431                 /* For even counts, don't walk off the end */
1432                 if ((middle + delta + 1) == num_used)
1433                         continue;
1434
1435                 /* Now try delta past middle */
1436                 if (ocfs2_refcount_rec_no_intersect(
1437                                         &rl->rl_recs[middle + delta],
1438                                         &rl->rl_recs[middle + delta + 1])) {
1439                         *split_index = middle + delta + 1;
1440                         break;
1441                 }
1442         }
1443
1444         if (delta >= middle)
1445                 return -ENOSPC;
1446
1447         *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1448         return 0;
1449 }
1450
1451 static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1452                                             struct buffer_head *new_bh,
1453                                             u32 *split_cpos)
1454 {
1455         int split_index = 0, num_moved, ret;
1456         u32 cpos = 0;
1457         struct ocfs2_refcount_block *rb =
1458                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1459         struct ocfs2_refcount_list *rl = &rb->rf_records;
1460         struct ocfs2_refcount_block *new_rb =
1461                         (struct ocfs2_refcount_block *)new_bh->b_data;
1462         struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1463
1464         mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
1465              (unsigned long long)ref_leaf_bh->b_blocknr,
1466              le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
1467
1468         /*
1469          * XXX: Improvement later.
1470          * If we know all the high 32 bit cpos is the same, no need to sort.
1471          *
1472          * In order to make the whole process safe, we do:
1473          * 1. sort the entries by their low 32 bit cpos first so that we can
1474          *    find the split cpos easily.
1475          * 2. call ocfs2_insert_extent to insert the new refcount block.
1476          * 3. move the refcount rec to the new block.
1477          * 4. sort the entries by their 64 bit cpos.
1478          * 5. dirty the new_rb and rb.
1479          */
1480         sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1481              sizeof(struct ocfs2_refcount_rec),
1482              cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1483
1484         ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1485         if (ret) {
1486                 mlog_errno(ret);
1487                 return ret;
1488         }
1489
1490         new_rb->rf_cpos = cpu_to_le32(cpos);
1491
1492         /* move refcount records starting from split_index to the new block. */
1493         num_moved = le16_to_cpu(rl->rl_used) - split_index;
1494         memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1495                num_moved * sizeof(struct ocfs2_refcount_rec));
1496
1497         /*ok, remove the entries we just moved over to the other block. */
1498         memset(&rl->rl_recs[split_index], 0,
1499                num_moved * sizeof(struct ocfs2_refcount_rec));
1500
1501         /* change old and new rl_used accordingly. */
1502         le16_add_cpu(&rl->rl_used, -num_moved);
1503         new_rl->rl_used = cpu_to_le16(num_moved);
1504
1505         sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1506              sizeof(struct ocfs2_refcount_rec),
1507              cmp_refcount_rec_by_cpos, swap_refcount_rec);
1508
1509         sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1510              sizeof(struct ocfs2_refcount_rec),
1511              cmp_refcount_rec_by_cpos, swap_refcount_rec);
1512
1513         *split_cpos = cpos;
1514         return 0;
1515 }
1516
1517 static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1518                                          struct ocfs2_caching_info *ci,
1519                                          struct buffer_head *ref_root_bh,
1520                                          struct buffer_head *ref_leaf_bh,
1521                                          struct ocfs2_alloc_context *meta_ac)
1522 {
1523         int ret;
1524         u16 suballoc_bit_start;
1525         u32 num_got, new_cpos;
1526         u64 blkno;
1527         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1528         struct ocfs2_refcount_block *root_rb =
1529                         (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1530         struct buffer_head *new_bh = NULL;
1531         struct ocfs2_refcount_block *new_rb;
1532         struct ocfs2_extent_tree ref_et;
1533
1534         BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1535
1536         ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1537                                       OCFS2_JOURNAL_ACCESS_WRITE);
1538         if (ret) {
1539                 mlog_errno(ret);
1540                 goto out;
1541         }
1542
1543         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1544                                       OCFS2_JOURNAL_ACCESS_WRITE);
1545         if (ret) {
1546                 mlog_errno(ret);
1547                 goto out;
1548         }
1549
1550         ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1551                                    &suballoc_bit_start, &num_got,
1552                                    &blkno);
1553         if (ret) {
1554                 mlog_errno(ret);
1555                 goto out;
1556         }
1557
1558         new_bh = sb_getblk(sb, blkno);
1559         if (new_bh == NULL) {
1560                 ret = -EIO;
1561                 mlog_errno(ret);
1562                 goto out;
1563         }
1564         ocfs2_set_new_buffer_uptodate(ci, new_bh);
1565
1566         ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1567                                       OCFS2_JOURNAL_ACCESS_CREATE);
1568         if (ret) {
1569                 mlog_errno(ret);
1570                 goto out;
1571         }
1572
1573         /* Initialize ocfs2_refcount_block. */
1574         new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1575         memset(new_rb, 0, sb->s_blocksize);
1576         strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1577         new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1578         new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1579         new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1580         new_rb->rf_blkno = cpu_to_le64(blkno);
1581         new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1582         new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1583         new_rb->rf_records.rl_count =
1584                                 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1585         new_rb->rf_generation = root_rb->rf_generation;
1586
1587         ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1588         if (ret) {
1589                 mlog_errno(ret);
1590                 goto out;
1591         }
1592
1593         ocfs2_journal_dirty(handle, ref_leaf_bh);
1594         ocfs2_journal_dirty(handle, new_bh);
1595
1596         ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1597
1598         mlog(0, "insert new leaf block %llu at %u\n",
1599              (unsigned long long)new_bh->b_blocknr, new_cpos);
1600
1601         /* Insert the new leaf block with the specific offset cpos. */
1602         ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1603                                   1, 0, meta_ac);
1604         if (ret)
1605                 mlog_errno(ret);
1606
1607 out:
1608         brelse(new_bh);
1609         return ret;
1610 }
1611
1612 static int ocfs2_expand_refcount_tree(handle_t *handle,
1613                                       struct ocfs2_caching_info *ci,
1614                                       struct buffer_head *ref_root_bh,
1615                                       struct buffer_head *ref_leaf_bh,
1616                                       struct ocfs2_alloc_context *meta_ac)
1617 {
1618         int ret;
1619         struct buffer_head *expand_bh = NULL;
1620
1621         if (ref_root_bh == ref_leaf_bh) {
1622                 /*
1623                  * the old root bh hasn't been expanded to a b-tree,
1624                  * so expand it first.
1625                  */
1626                 ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1627                                                    &expand_bh, meta_ac);
1628                 if (ret) {
1629                         mlog_errno(ret);
1630                         goto out;
1631                 }
1632         } else {
1633                 expand_bh = ref_leaf_bh;
1634                 get_bh(expand_bh);
1635         }
1636
1637
1638         /* Now add a new refcount block into the tree.*/
1639         ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1640                                             expand_bh, meta_ac);
1641         if (ret)
1642                 mlog_errno(ret);
1643 out:
1644         brelse(expand_bh);
1645         return ret;
1646 }
1647
1648 /*
1649  * Adjust the extent rec in b-tree representing ref_leaf_bh.
1650  *
1651  * Only called when we have inserted a new refcount rec at index 0
1652  * which means ocfs2_extent_rec.e_cpos may need some change.
1653  */
1654 static int ocfs2_adjust_refcount_rec(handle_t *handle,
1655                                      struct ocfs2_caching_info *ci,
1656                                      struct buffer_head *ref_root_bh,
1657                                      struct buffer_head *ref_leaf_bh,
1658                                      struct ocfs2_refcount_rec *rec)
1659 {
1660         int ret = 0, i;
1661         u32 new_cpos, old_cpos;
1662         struct ocfs2_path *path = NULL;
1663         struct ocfs2_extent_tree et;
1664         struct ocfs2_refcount_block *rb =
1665                 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1666         struct ocfs2_extent_list *el;
1667
1668         if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1669                 goto out;
1670
1671         rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1672         old_cpos = le32_to_cpu(rb->rf_cpos);
1673         new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1674         if (old_cpos <= new_cpos)
1675                 goto out;
1676
1677         ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1678
1679         path = ocfs2_new_path_from_et(&et);
1680         if (!path) {
1681                 ret = -ENOMEM;
1682                 mlog_errno(ret);
1683                 goto out;
1684         }
1685
1686         ret = ocfs2_find_path(ci, path, old_cpos);
1687         if (ret) {
1688                 mlog_errno(ret);
1689                 goto out;
1690         }
1691
1692         /*
1693          * 2 more credits, one for the leaf refcount block, one for
1694          * the extent block contains the extent rec.
1695          */
1696         ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
1697         if (ret < 0) {
1698                 mlog_errno(ret);
1699                 goto out;
1700         }
1701
1702         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1703                                       OCFS2_JOURNAL_ACCESS_WRITE);
1704         if (ret < 0) {
1705                 mlog_errno(ret);
1706                 goto out;
1707         }
1708
1709         ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1710                                       OCFS2_JOURNAL_ACCESS_WRITE);
1711         if (ret < 0) {
1712                 mlog_errno(ret);
1713                 goto out;
1714         }
1715
1716         /* change the leaf extent block first. */
1717         el = path_leaf_el(path);
1718
1719         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1720                 if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1721                         break;
1722
1723         BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1724
1725         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1726
1727         /* change the r_cpos in the leaf block. */
1728         rb->rf_cpos = cpu_to_le32(new_cpos);
1729
1730         ocfs2_journal_dirty(handle, path_leaf_bh(path));
1731         ocfs2_journal_dirty(handle, ref_leaf_bh);
1732
1733 out:
1734         ocfs2_free_path(path);
1735         return ret;
1736 }
1737
1738 static int ocfs2_insert_refcount_rec(handle_t *handle,
1739                                      struct ocfs2_caching_info *ci,
1740                                      struct buffer_head *ref_root_bh,
1741                                      struct buffer_head *ref_leaf_bh,
1742                                      struct ocfs2_refcount_rec *rec,
1743                                      int index, int merge,
1744                                      struct ocfs2_alloc_context *meta_ac)
1745 {
1746         int ret;
1747         struct ocfs2_refcount_block *rb =
1748                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1749         struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1750         struct buffer_head *new_bh = NULL;
1751
1752         BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1753
1754         if (rf_list->rl_used == rf_list->rl_count) {
1755                 u64 cpos = le64_to_cpu(rec->r_cpos);
1756                 u32 len = le32_to_cpu(rec->r_clusters);
1757
1758                 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1759                                                  ref_leaf_bh, meta_ac);
1760                 if (ret) {
1761                         mlog_errno(ret);
1762                         goto out;
1763                 }
1764
1765                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1766                                              cpos, len, NULL, &index,
1767                                              &new_bh);
1768                 if (ret) {
1769                         mlog_errno(ret);
1770                         goto out;
1771                 }
1772
1773                 ref_leaf_bh = new_bh;
1774                 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1775                 rf_list = &rb->rf_records;
1776         }
1777
1778         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1779                                       OCFS2_JOURNAL_ACCESS_WRITE);
1780         if (ret) {
1781                 mlog_errno(ret);
1782                 goto out;
1783         }
1784
1785         if (index < le16_to_cpu(rf_list->rl_used))
1786                 memmove(&rf_list->rl_recs[index + 1],
1787                         &rf_list->rl_recs[index],
1788                         (le16_to_cpu(rf_list->rl_used) - index) *
1789                          sizeof(struct ocfs2_refcount_rec));
1790
1791         mlog(0, "insert refcount record start %llu, len %u, count %u "
1792              "to leaf block %llu at index %d\n",
1793              (unsigned long long)le64_to_cpu(rec->r_cpos),
1794              le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
1795              (unsigned long long)ref_leaf_bh->b_blocknr, index);
1796
1797         rf_list->rl_recs[index] = *rec;
1798
1799         le16_add_cpu(&rf_list->rl_used, 1);
1800
1801         if (merge)
1802                 ocfs2_refcount_rec_merge(rb, index);
1803
1804         ocfs2_journal_dirty(handle, ref_leaf_bh);
1805
1806         if (index == 0) {
1807                 ret = ocfs2_adjust_refcount_rec(handle, ci,
1808                                                 ref_root_bh,
1809                                                 ref_leaf_bh, rec);
1810                 if (ret)
1811                         mlog_errno(ret);
1812         }
1813 out:
1814         brelse(new_bh);
1815         return ret;
1816 }
1817
1818 /*
1819  * Split the refcount_rec indexed by "index" in ref_leaf_bh.
1820  * This is much simple than our b-tree code.
1821  * split_rec is the new refcount rec we want to insert.
1822  * If split_rec->r_refcount > 0, we are changing the refcount(in case we
1823  * increase refcount or decrease a refcount to non-zero).
1824  * If split_rec->r_refcount == 0, we are punching a hole in current refcount
1825  * rec( in case we decrease a refcount to zero).
1826  */
1827 static int ocfs2_split_refcount_rec(handle_t *handle,
1828                                     struct ocfs2_caching_info *ci,
1829                                     struct buffer_head *ref_root_bh,
1830                                     struct buffer_head *ref_leaf_bh,
1831                                     struct ocfs2_refcount_rec *split_rec,
1832                                     int index, int merge,
1833                                     struct ocfs2_alloc_context *meta_ac,
1834                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
1835 {
1836         int ret, recs_need;
1837         u32 len;
1838         struct ocfs2_refcount_block *rb =
1839                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1840         struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1841         struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
1842         struct ocfs2_refcount_rec *tail_rec = NULL;
1843         struct buffer_head *new_bh = NULL;
1844
1845         BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1846
1847         mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
1848              le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
1849              le64_to_cpu(split_rec->r_cpos),
1850              le32_to_cpu(split_rec->r_clusters));
1851
1852         /*
1853          * If we just need to split the header or tail clusters,
1854          * no more recs are needed, just split is OK.
1855          * Otherwise we at least need one new recs.
1856          */
1857         if (!split_rec->r_refcount &&
1858             (split_rec->r_cpos == orig_rec->r_cpos ||
1859              le64_to_cpu(split_rec->r_cpos) +
1860              le32_to_cpu(split_rec->r_clusters) ==
1861              le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1862                 recs_need = 0;
1863         else
1864                 recs_need = 1;
1865
1866         /*
1867          * We need one more rec if we split in the middle and the new rec have
1868          * some refcount in it.
1869          */
1870         if (split_rec->r_refcount &&
1871             (split_rec->r_cpos != orig_rec->r_cpos &&
1872              le64_to_cpu(split_rec->r_cpos) +
1873              le32_to_cpu(split_rec->r_clusters) !=
1874              le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1875                 recs_need++;
1876
1877         /* If the leaf block don't have enough record, expand it. */
1878         if (le16_to_cpu(rf_list->rl_used) + recs_need >
1879                                          le16_to_cpu(rf_list->rl_count)) {
1880                 struct ocfs2_refcount_rec tmp_rec;
1881                 u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1882                 len = le32_to_cpu(orig_rec->r_clusters);
1883                 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1884                                                  ref_leaf_bh, meta_ac);
1885                 if (ret) {
1886                         mlog_errno(ret);
1887                         goto out;
1888                 }
1889
1890                 /*
1891                  * We have to re-get it since now cpos may be moved to
1892                  * another leaf block.
1893                  */
1894                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1895                                              cpos, len, &tmp_rec, &index,
1896                                              &new_bh);
1897                 if (ret) {
1898                         mlog_errno(ret);
1899                         goto out;
1900                 }
1901
1902                 ref_leaf_bh = new_bh;
1903                 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1904                 rf_list = &rb->rf_records;
1905                 orig_rec = &rf_list->rl_recs[index];
1906         }
1907
1908         ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1909                                       OCFS2_JOURNAL_ACCESS_WRITE);
1910         if (ret) {
1911                 mlog_errno(ret);
1912                 goto out;
1913         }
1914
1915         /*
1916          * We have calculated out how many new records we need and store
1917          * in recs_need, so spare enough space first by moving the records
1918          * after "index" to the end.
1919          */
1920         if (index != le16_to_cpu(rf_list->rl_used) - 1)
1921                 memmove(&rf_list->rl_recs[index + 1 + recs_need],
1922                         &rf_list->rl_recs[index + 1],
1923                         (le16_to_cpu(rf_list->rl_used) - index - 1) *
1924                          sizeof(struct ocfs2_refcount_rec));
1925
1926         len = (le64_to_cpu(orig_rec->r_cpos) +
1927               le32_to_cpu(orig_rec->r_clusters)) -
1928               (le64_to_cpu(split_rec->r_cpos) +
1929               le32_to_cpu(split_rec->r_clusters));
1930
1931         /*
1932          * If we have "len", the we will split in the tail and move it
1933          * to the end of the space we have just spared.
1934          */
1935         if (len) {
1936                 tail_rec = &rf_list->rl_recs[index + recs_need];
1937
1938                 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1939                 le64_add_cpu(&tail_rec->r_cpos,
1940                              le32_to_cpu(tail_rec->r_clusters) - len);
1941                 tail_rec->r_clusters = cpu_to_le32(len);
1942         }
1943
1944         /*
1945          * If the split pos isn't the same as the original one, we need to
1946          * split in the head.
1947          *
1948          * Note: We have the chance that split_rec.r_refcount = 0,
1949          * recs_need = 0 and len > 0, which means we just cut the head from
1950          * the orig_rec and in that case we have done some modification in
1951          * orig_rec above, so the check for r_cpos is faked.
1952          */
1953         if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
1954                 len = le64_to_cpu(split_rec->r_cpos) -
1955                       le64_to_cpu(orig_rec->r_cpos);
1956                 orig_rec->r_clusters = cpu_to_le32(len);
1957                 index++;
1958         }
1959
1960         le16_add_cpu(&rf_list->rl_used, recs_need);
1961
1962         if (split_rec->r_refcount) {
1963                 rf_list->rl_recs[index] = *split_rec;
1964                 mlog(0, "insert refcount record start %llu, len %u, count %u "
1965                      "to leaf block %llu at index %d\n",
1966                      (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1967                      le32_to_cpu(split_rec->r_clusters),
1968                      le32_to_cpu(split_rec->r_refcount),
1969                      (unsigned long long)ref_leaf_bh->b_blocknr, index);
1970
1971                 if (merge)
1972                         ocfs2_refcount_rec_merge(rb, index);
1973         }
1974
1975         ocfs2_journal_dirty(handle, ref_leaf_bh);
1976
1977 out:
1978         brelse(new_bh);
1979         return ret;
1980 }
1981
1982 static int __ocfs2_increase_refcount(handle_t *handle,
1983                                      struct ocfs2_caching_info *ci,
1984                                      struct buffer_head *ref_root_bh,
1985                                      u64 cpos, u32 len, int merge,
1986                                      struct ocfs2_alloc_context *meta_ac,
1987                                      struct ocfs2_cached_dealloc_ctxt *dealloc)
1988 {
1989         int ret = 0, index;
1990         struct buffer_head *ref_leaf_bh = NULL;
1991         struct ocfs2_refcount_rec rec;
1992         unsigned int set_len = 0;
1993
1994         mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
1995              (unsigned long long)ocfs2_metadata_cache_owner(ci),
1996              (unsigned long long)cpos, len);
1997
1998         while (len) {
1999                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2000                                              cpos, len, &rec, &index,
2001                                              &ref_leaf_bh);
2002                 if (ret) {
2003                         mlog_errno(ret);
2004                         goto out;
2005                 }
2006
2007                 set_len = le32_to_cpu(rec.r_clusters);
2008
2009                 /*
2010                  * Here we may meet with 3 situations:
2011                  *
2012                  * 1. If we find an already existing record, and the length
2013                  *    is the same, cool, we just need to increase the r_refcount
2014                  *    and it is OK.
2015                  * 2. If we find a hole, just insert it with r_refcount = 1.
2016                  * 3. If we are in the middle of one extent record, split
2017                  *    it.
2018                  */
2019                 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
2020                     set_len <= len) {
2021                         mlog(0, "increase refcount rec, start %llu, len %u, "
2022                              "count %u\n", (unsigned long long)cpos, set_len,
2023                              le32_to_cpu(rec.r_refcount));
2024                         ret = ocfs2_change_refcount_rec(handle, ci,
2025                                                         ref_leaf_bh, index,
2026                                                         merge, 1);
2027                         if (ret) {
2028                                 mlog_errno(ret);
2029                                 goto out;
2030                         }
2031                 } else if (!rec.r_refcount) {
2032                         rec.r_refcount = cpu_to_le32(1);
2033
2034                         mlog(0, "insert refcount rec, start %llu, len %u\n",
2035                              (unsigned long long)le64_to_cpu(rec.r_cpos),
2036                              set_len);
2037                         ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
2038                                                         ref_leaf_bh,
2039                                                         &rec, index,
2040                                                         merge, meta_ac);
2041                         if (ret) {
2042                                 mlog_errno(ret);
2043                                 goto out;
2044                         }
2045                 } else  {
2046                         set_len = min((u64)(cpos + len),
2047                                       le64_to_cpu(rec.r_cpos) + set_len) - cpos;
2048                         rec.r_cpos = cpu_to_le64(cpos);
2049                         rec.r_clusters = cpu_to_le32(set_len);
2050                         le32_add_cpu(&rec.r_refcount, 1);
2051
2052                         mlog(0, "split refcount rec, start %llu, "
2053                              "len %u, count %u\n",
2054                              (unsigned long long)le64_to_cpu(rec.r_cpos),
2055                              set_len, le32_to_cpu(rec.r_refcount));
2056                         ret = ocfs2_split_refcount_rec(handle, ci,
2057                                                        ref_root_bh, ref_leaf_bh,
2058                                                        &rec, index, merge,
2059                                                        meta_ac, dealloc);
2060                         if (ret) {
2061                                 mlog_errno(ret);
2062                                 goto out;
2063                         }
2064                 }
2065
2066                 cpos += set_len;
2067                 len -= set_len;
2068                 brelse(ref_leaf_bh);
2069                 ref_leaf_bh = NULL;
2070         }
2071
2072 out:
2073         brelse(ref_leaf_bh);
2074         return ret;
2075 }
2076
2077 static int ocfs2_remove_refcount_extent(handle_t *handle,
2078                                 struct ocfs2_caching_info *ci,
2079                                 struct buffer_head *ref_root_bh,
2080                                 struct buffer_head *ref_leaf_bh,
2081                                 struct ocfs2_alloc_context *meta_ac,
2082                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2083 {
2084         int ret;
2085         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2086         struct ocfs2_refcount_block *rb =
2087                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2088         struct ocfs2_extent_tree et;
2089
2090         BUG_ON(rb->rf_records.rl_used);
2091
2092         ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2093         ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2094                                   1, meta_ac, dealloc);
2095         if (ret) {
2096                 mlog_errno(ret);
2097                 goto out;
2098         }
2099
2100         ocfs2_remove_from_cache(ci, ref_leaf_bh);
2101
2102         /*
2103          * add the freed block to the dealloc so that it will be freed
2104          * when we run dealloc.
2105          */
2106         ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2107                                         le16_to_cpu(rb->rf_suballoc_slot),
2108                                         le64_to_cpu(rb->rf_blkno),
2109                                         le16_to_cpu(rb->rf_suballoc_bit));
2110         if (ret) {
2111                 mlog_errno(ret);
2112                 goto out;
2113         }
2114
2115         ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2116                                       OCFS2_JOURNAL_ACCESS_WRITE);
2117         if (ret) {
2118                 mlog_errno(ret);
2119                 goto out;
2120         }
2121
2122         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2123
2124         le32_add_cpu(&rb->rf_clusters, -1);
2125
2126         /*
2127          * check whether we need to restore the root refcount block if
2128          * there is no leaf extent block at atll.
2129          */
2130         if (!rb->rf_list.l_next_free_rec) {
2131                 BUG_ON(rb->rf_clusters);
2132
2133                 mlog(0, "reset refcount tree root %llu to be a record block.\n",
2134                      (unsigned long long)ref_root_bh->b_blocknr);
2135
2136                 rb->rf_flags = 0;
2137                 rb->rf_parent = 0;
2138                 rb->rf_cpos = 0;
2139                 memset(&rb->rf_records, 0, sb->s_blocksize -
2140                        offsetof(struct ocfs2_refcount_block, rf_records));
2141                 rb->rf_records.rl_count =
2142                                 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2143         }
2144
2145         ocfs2_journal_dirty(handle, ref_root_bh);
2146
2147 out:
2148         return ret;
2149 }
2150
2151 int ocfs2_increase_refcount(handle_t *handle,
2152                             struct ocfs2_caching_info *ci,
2153                             struct buffer_head *ref_root_bh,
2154                             u64 cpos, u32 len,
2155                             struct ocfs2_alloc_context *meta_ac,
2156                             struct ocfs2_cached_dealloc_ctxt *dealloc)
2157 {
2158         return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2159                                          cpos, len, 1,
2160                                          meta_ac, dealloc);
2161 }
2162
2163 static int ocfs2_decrease_refcount_rec(handle_t *handle,
2164                                 struct ocfs2_caching_info *ci,
2165                                 struct buffer_head *ref_root_bh,
2166                                 struct buffer_head *ref_leaf_bh,
2167                                 int index, u64 cpos, unsigned int len,
2168                                 struct ocfs2_alloc_context *meta_ac,
2169                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2170 {
2171         int ret;
2172         struct ocfs2_refcount_block *rb =
2173                         (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2174         struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2175
2176         BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2177         BUG_ON(cpos + len >
2178                le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2179
2180         if (cpos == le64_to_cpu(rec->r_cpos) &&
2181             len == le32_to_cpu(rec->r_clusters))
2182                 ret = ocfs2_change_refcount_rec(handle, ci,
2183                                                 ref_leaf_bh, index, 1, -1);
2184         else {
2185                 struct ocfs2_refcount_rec split = *rec;
2186                 split.r_cpos = cpu_to_le64(cpos);
2187                 split.r_clusters = cpu_to_le32(len);
2188
2189                 le32_add_cpu(&split.r_refcount, -1);
2190
2191                 mlog(0, "split refcount rec, start %llu, "
2192                      "len %u, count %u, original start %llu, len %u\n",
2193                      (unsigned long long)le64_to_cpu(split.r_cpos),
2194                      len, le32_to_cpu(split.r_refcount),
2195                      (unsigned long long)le64_to_cpu(rec->r_cpos),
2196                      le32_to_cpu(rec->r_clusters));
2197                 ret = ocfs2_split_refcount_rec(handle, ci,
2198                                                ref_root_bh, ref_leaf_bh,
2199                                                &split, index, 1,
2200                                                meta_ac, dealloc);
2201         }
2202
2203         if (ret) {
2204                 mlog_errno(ret);
2205                 goto out;
2206         }
2207
2208         /* Remove the leaf refcount block if it contains no refcount record. */
2209         if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2210                 ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2211                                                    ref_leaf_bh, meta_ac,
2212                                                    dealloc);
2213                 if (ret)
2214                         mlog_errno(ret);
2215         }
2216
2217 out:
2218         return ret;
2219 }
2220
2221 static int __ocfs2_decrease_refcount(handle_t *handle,
2222                                      struct ocfs2_caching_info *ci,
2223                                      struct buffer_head *ref_root_bh,
2224                                      u64 cpos, u32 len,
2225                                      struct ocfs2_alloc_context *meta_ac,
2226                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2227                                      int delete)
2228 {
2229         int ret = 0, index = 0;
2230         struct ocfs2_refcount_rec rec;
2231         unsigned int r_count = 0, r_len;
2232         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2233         struct buffer_head *ref_leaf_bh = NULL;
2234
2235         mlog(0, "Tree owner %llu, decrease refcount start %llu, "
2236              "len %u, delete %u\n",
2237              (unsigned long long)ocfs2_metadata_cache_owner(ci),
2238              (unsigned long long)cpos, len, delete);
2239
2240         while (len) {
2241                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2242                                              cpos, len, &rec, &index,
2243                                              &ref_leaf_bh);
2244                 if (ret) {
2245                         mlog_errno(ret);
2246                         goto out;
2247                 }
2248
2249                 r_count = le32_to_cpu(rec.r_refcount);
2250                 BUG_ON(r_count == 0);
2251                 if (!delete)
2252                         BUG_ON(r_count > 1);
2253
2254                 r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2255                               le32_to_cpu(rec.r_clusters)) - cpos;
2256
2257                 ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2258                                                   ref_leaf_bh, index,
2259                                                   cpos, r_len,
2260                                                   meta_ac, dealloc);
2261                 if (ret) {
2262                         mlog_errno(ret);
2263                         goto out;
2264                 }
2265
2266                 if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2267                         ret = ocfs2_cache_cluster_dealloc(dealloc,
2268                                           ocfs2_clusters_to_blocks(sb, cpos),
2269                                                           r_len);
2270                         if (ret) {
2271                                 mlog_errno(ret);
2272                                 goto out;
2273                         }
2274                 }
2275
2276                 cpos += r_len;
2277                 len -= r_len;
2278                 brelse(ref_leaf_bh);
2279                 ref_leaf_bh = NULL;
2280         }
2281
2282 out:
2283         brelse(ref_leaf_bh);
2284         return ret;
2285 }
2286
2287 /* Caller must hold refcount tree lock. */
2288 int ocfs2_decrease_refcount(struct inode *inode,
2289                             handle_t *handle, u32 cpos, u32 len,
2290                             struct ocfs2_alloc_context *meta_ac,
2291                             struct ocfs2_cached_dealloc_ctxt *dealloc,
2292                             int delete)
2293 {
2294         int ret;
2295         u64 ref_blkno;
2296         struct ocfs2_inode_info *oi = OCFS2_I(inode);
2297         struct buffer_head *ref_root_bh = NULL;
2298         struct ocfs2_refcount_tree *tree;
2299
2300         BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2301
2302         ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2303         if (ret) {
2304                 mlog_errno(ret);
2305                 goto out;
2306         }
2307
2308         ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2309         if (ret) {
2310                 mlog_errno(ret);
2311                 goto out;
2312         }
2313
2314         ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2315                                         &ref_root_bh);
2316         if (ret) {
2317                 mlog_errno(ret);
2318                 goto out;
2319         }
2320
2321         ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2322                                         cpos, len, meta_ac, dealloc, delete);
2323         if (ret)
2324                 mlog_errno(ret);
2325 out:
2326         brelse(ref_root_bh);
2327         return ret;
2328 }
2329
2330 /*
2331  * Mark the already-existing extent at cpos as refcounted for len clusters.
2332  * This adds the refcount extent flag.
2333  *
2334  * If the existing extent is larger than the request, initiate a
2335  * split. An attempt will be made at merging with adjacent extents.
2336  *
2337  * The caller is responsible for passing down meta_ac if we'll need it.
2338  */
2339 static int ocfs2_mark_extent_refcounted(struct inode *inode,
2340                                 struct ocfs2_extent_tree *et,
2341                                 handle_t *handle, u32 cpos,
2342                                 u32 len, u32 phys,
2343                                 struct ocfs2_alloc_context *meta_ac,
2344                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2345 {
2346         int ret;
2347
2348         mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
2349              inode->i_ino, cpos, len, phys);
2350
2351         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2352                 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2353                             "tree, but the feature bit is not set in the "
2354                             "super block.", inode->i_ino);
2355                 ret = -EROFS;
2356                 goto out;
2357         }
2358
2359         ret = ocfs2_change_extent_flag(handle, et, cpos,
2360                                        len, phys, meta_ac, dealloc,
2361                                        OCFS2_EXT_REFCOUNTED, 0);
2362         if (ret)
2363                 mlog_errno(ret);
2364
2365 out:
2366         return ret;
2367 }
2368
2369 /*
2370  * Given some contiguous physical clusters, calculate what we need
2371  * for modifying their refcount.
2372  */
2373 static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2374                                             struct ocfs2_caching_info *ci,
2375                                             struct buffer_head *ref_root_bh,
2376                                             u64 start_cpos,
2377                                             u32 clusters,
2378                                             int *meta_add,
2379                                             int *credits)
2380 {
2381         int ret = 0, index, ref_blocks = 0, recs_add = 0;
2382         u64 cpos = start_cpos;
2383         struct ocfs2_refcount_block *rb;
2384         struct ocfs2_refcount_rec rec;
2385         struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2386         u32 len;
2387
2388         mlog(0, "start_cpos %llu, clusters %u\n",
2389              (unsigned long long)start_cpos, clusters);
2390         while (clusters) {
2391                 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2392                                              cpos, clusters, &rec,
2393                                              &index, &ref_leaf_bh);
2394                 if (ret) {
2395                         mlog_errno(ret);
2396                         goto out;
2397                 }
2398
2399                 if (ref_leaf_bh != prev_bh) {
2400                         /*
2401                          * Now we encounter a new leaf block, so calculate
2402                          * whether we need to extend the old leaf.
2403                          */
2404                         if (prev_bh) {
2405                                 rb = (struct ocfs2_refcount_block *)
2406                                                         prev_bh->b_data;
2407
2408                                 if (le64_to_cpu(rb->rf_records.rl_used) +
2409                                     recs_add >
2410                                     le16_to_cpu(rb->rf_records.rl_count))
2411                                         ref_blocks++;
2412                         }
2413
2414                         recs_add = 0;
2415                         *credits += 1;
2416                         brelse(prev_bh);
2417                         prev_bh = ref_leaf_bh;
2418                         get_bh(prev_bh);
2419                 }
2420
2421                 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2422
2423                 mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
2424                      "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
2425                      recs_add, (unsigned long long)cpos, clusters,
2426                      (unsigned long long)le64_to_cpu(rec.r_cpos),
2427                      le32_to_cpu(rec.r_clusters),
2428                      le32_to_cpu(rec.r_refcount), index);
2429
2430                 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2431                           le32_to_cpu(rec.r_clusters)) - cpos;
2432                 /*
2433                  * If the refcount rec already exist, cool. We just need
2434                  * to check whether there is a split. Otherwise we just need
2435                  * to increase the refcount.
2436                  * If we will insert one, increases recs_add.
2437                  *
2438                  * We record all the records which will be inserted to the
2439                  * same refcount block, so that we can tell exactly whether
2440                  * we need a new refcount block or not.
2441                  */
2442                 if (rec.r_refcount) {
2443                         /* Check whether we need a split at the beginning. */
2444                         if (cpos == start_cpos &&
2445                             cpos != le64_to_cpu(rec.r_cpos))
2446                                 recs_add++;
2447
2448                         /* Check whether we need a split in the end. */
2449                         if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2450                             le32_to_cpu(rec.r_clusters))
2451                                 recs_add++;
2452                 } else
2453                         recs_add++;
2454
2455                 brelse(ref_leaf_bh);
2456                 ref_leaf_bh = NULL;
2457                 clusters -= len;
2458                 cpos += len;
2459         }
2460
2461         if (prev_bh) {
2462                 rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2463
2464                 if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
2465                     le16_to_cpu(rb->rf_records.rl_count))
2466                         ref_blocks++;
2467
2468                 *credits += 1;
2469         }
2470
2471         if (!ref_blocks)
2472                 goto out;
2473
2474         mlog(0, "we need ref_blocks %d\n", ref_blocks);
2475         *meta_add += ref_blocks;
2476         *credits += ref_blocks;
2477
2478         /*
2479          * So we may need ref_blocks to insert into the tree.
2480          * That also means we need to change the b-tree and add that number
2481          * of records since we never merge them.
2482          * We need one more block for expansion since the new created leaf
2483          * block is also full and needs split.
2484          */
2485         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2486         if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2487                 struct ocfs2_extent_tree et;
2488
2489                 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2490                 *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2491                 *credits += ocfs2_calc_extend_credits(sb,
2492                                                       et.et_root_el,
2493                                                       ref_blocks);
2494         } else {
2495                 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2496                 *meta_add += 1;
2497         }
2498
2499 out:
2500         brelse(ref_leaf_bh);
2501         brelse(prev_bh);
2502         return ret;
2503 }
2504
2505 /*
2506  * For refcount tree, we will decrease some contiguous clusters
2507  * refcount count, so just go through it to see how many blocks
2508  * we gonna touch and whether we need to create new blocks.
2509  *
2510  * Normally the refcount blocks store these refcount should be
2511  * contiguous also, so that we can get the number easily.
2512  * As for meta_ac, we will at most add split 2 refcount record and
2513  * 2 more refcount block, so just check it in a rough way.
2514  *
2515  * Caller must hold refcount tree lock.
2516  */
2517 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2518                                           struct buffer_head *di_bh,
2519                                           u64 phys_blkno,
2520                                           u32 clusters,
2521                                           int *credits,
2522                                           struct ocfs2_alloc_context **meta_ac)
2523 {
2524         int ret, ref_blocks = 0;
2525         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2526         struct ocfs2_inode_info *oi = OCFS2_I(inode);
2527         struct buffer_head *ref_root_bh = NULL;
2528         struct ocfs2_refcount_tree *tree;
2529         u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2530
2531         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2532                 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2533                             "tree, but the feature bit is not set in the "
2534                             "super block.", inode->i_ino);
2535                 ret = -EROFS;
2536                 goto out;
2537         }
2538
2539         BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2540
2541         ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2542                                       le64_to_cpu(di->i_refcount_loc), &tree);
2543         if (ret) {
2544                 mlog_errno(ret);
2545                 goto out;
2546         }
2547
2548         ret = ocfs2_read_refcount_block(&tree->rf_ci,
2549                                         le64_to_cpu(di->i_refcount_loc),
2550                                         &ref_root_bh);
2551         if (ret) {
2552                 mlog_errno(ret);
2553                 goto out;
2554         }
2555
2556         ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2557                                                &tree->rf_ci,
2558                                                ref_root_bh,
2559                                                start_cpos, clusters,
2560                                                &ref_blocks, credits);
2561         if (ret) {
2562                 mlog_errno(ret);
2563                 goto out;
2564         }
2565
2566         mlog(0, "reserve new metadata %d, credits = %d\n",
2567              ref_blocks, *credits);
2568
2569         if (ref_blocks) {
2570                 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2571                                                         ref_blocks, meta_ac);
2572                 if (ret)
2573                         mlog_errno(ret);
2574         }
2575
2576 out:
2577         brelse(ref_root_bh);
2578         return ret;
2579 }
2580
2581 #define MAX_CONTIG_BYTES        1048576
2582
2583 static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
2584 {
2585         return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
2586 }
2587
2588 static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
2589 {
2590         return ~(ocfs2_cow_contig_clusters(sb) - 1);
2591 }
2592
2593 /*
2594  * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
2595  * find an offset (start + (n * contig_clusters)) that is closest to cpos
2596  * while still being less than or equal to it.
2597  *
2598  * The goal is to break the extent at a multiple of contig_clusters.
2599  */
2600 static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
2601                                                  unsigned int start,
2602                                                  unsigned int cpos)
2603 {
2604         BUG_ON(start > cpos);
2605
2606         return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
2607 }
2608
2609 /*
2610  * Given a cluster count of len, pad it out so that it is a multiple
2611  * of contig_clusters.
2612  */
2613 static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
2614                                                   unsigned int len)
2615 {
2616         unsigned int padded =
2617                 (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
2618                 ocfs2_cow_contig_mask(sb);
2619
2620         /* Did we wrap? */
2621         if (padded < len)
2622                 padded = UINT_MAX;
2623
2624         return padded;
2625 }
2626
2627 /*
2628  * Calculate out the start and number of virtual clusters we need to to CoW.
2629  *
2630  * cpos is vitual start cluster position we want to do CoW in a
2631  * file and write_len is the cluster length.
2632  * max_cpos is the place where we want to stop CoW intentionally.
2633  *
2634  * Normal we will start CoW from the beginning of extent record cotaining cpos.
2635  * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
2636  * get good I/O from the resulting extent tree.
2637  */
2638 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2639                                            struct ocfs2_extent_list *el,
2640                                            u32 cpos,
2641                                            u32 write_len,
2642                                            u32 max_cpos,
2643                                            u32 *cow_start,
2644                                            u32 *cow_len)
2645 {
2646         int ret = 0;
2647         int tree_height = le16_to_cpu(el->l_tree_depth), i;
2648         struct buffer_head *eb_bh = NULL;
2649         struct ocfs2_extent_block *eb = NULL;
2650         struct ocfs2_extent_rec *rec;
2651         unsigned int want_clusters, rec_end = 0;
2652         int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2653         int leaf_clusters;
2654
2655         BUG_ON(cpos + write_len > max_cpos);
2656
2657         if (tree_height > 0) {
2658                 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2659                 if (ret) {
2660                         mlog_errno(ret);
2661                         goto out;
2662                 }
2663
2664                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2665                 el = &eb->h_list;
2666
2667                 if (el->l_tree_depth) {
2668                         ocfs2_error(inode->i_sb,
2669                                     "Inode %lu has non zero tree depth in "
2670                                     "leaf block %llu\n", inode->i_ino,
2671                                     (unsigned long long)eb_bh->b_blocknr);
2672                         ret = -EROFS;
2673                         goto out;
2674                 }
2675         }
2676
2677         *cow_len = 0;
2678         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2679                 rec = &el->l_recs[i];
2680
2681                 if (ocfs2_is_empty_extent(rec)) {
2682                         mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2683                                         "index %d\n", inode->i_ino, i);
2684                         continue;
2685                 }
2686
2687                 if (le32_to_cpu(rec->e_cpos) +
2688                     le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2689                         continue;
2690
2691                 if (*cow_len == 0) {
2692                         /*
2693                          * We should find a refcounted record in the
2694                          * first pass.
2695                          */
2696                         BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2697                         *cow_start = le32_to_cpu(rec->e_cpos);
2698                 }
2699
2700                 /*
2701                  * If we encounter a hole, a non-refcounted record or
2702                  * pass the max_cpos, stop the search.
2703                  */
2704                 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2705                     (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2706                     (max_cpos <= le32_to_cpu(rec->e_cpos)))
2707                         break;
2708
2709                 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2710                 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2711                 if (rec_end > max_cpos) {
2712                         rec_end = max_cpos;
2713                         leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2714                 }
2715
2716                 /*
2717                  * How many clusters do we actually need from
2718                  * this extent?  First we see how many we actually
2719                  * need to complete the write.  If that's smaller
2720                  * than contig_clusters, we try for contig_clusters.
2721                  */
2722                 if (!*cow_len)
2723                         want_clusters = write_len;
2724                 else
2725                         want_clusters = (cpos + write_len) -
2726                                 (*cow_start + *cow_len);
2727                 if (want_clusters < contig_clusters)
2728                         want_clusters = contig_clusters;
2729
2730                 /*
2731                  * If the write does not cover the whole extent, we
2732                  * need to calculate how we're going to split the extent.
2733                  * We try to do it on contig_clusters boundaries.
2734                  *
2735                  * Any extent smaller than contig_clusters will be
2736                  * CoWed in its entirety.
2737                  */
2738                 if (leaf_clusters <= contig_clusters)
2739                         *cow_len += leaf_clusters;
2740                 else if (*cow_len || (*cow_start == cpos)) {
2741                         /*
2742                          * This extent needs to be CoW'd from its
2743                          * beginning, so all we have to do is compute
2744                          * how many clusters to grab.  We align
2745                          * want_clusters to the edge of contig_clusters
2746                          * to get better I/O.
2747                          */
2748                         want_clusters = ocfs2_cow_align_length(inode->i_sb,
2749                                                                want_clusters);
2750
2751                         if (leaf_clusters < want_clusters)
2752                                 *cow_len += leaf_clusters;
2753                         else
2754                                 *cow_len += want_clusters;
2755                 } else if ((*cow_start + contig_clusters) >=
2756                            (cpos + write_len)) {
2757                         /*
2758                          * Breaking off contig_clusters at the front
2759                          * of the extent will cover our write.  That's
2760                          * easy.
2761                          */
2762                         *cow_len = contig_clusters;
2763                 } else if ((rec_end - cpos) <= contig_clusters) {
2764                         /*
2765                          * Breaking off contig_clusters at the tail of
2766                          * this extent will cover cpos.
2767                          */
2768                         *cow_start = rec_end - contig_clusters;
2769                         *cow_len = contig_clusters;
2770                 } else if ((rec_end - cpos) <= want_clusters) {
2771                         /*
2772                          * While we can't fit the entire write in this
2773                          * extent, we know that the write goes from cpos
2774                          * to the end of the extent.  Break that off.
2775                          * We try to break it at some multiple of
2776                          * contig_clusters from the front of the extent.
2777                          * Failing that (ie, cpos is within
2778                          * contig_clusters of the front), we'll CoW the
2779                          * entire extent.
2780                          */
2781                         *cow_start = ocfs2_cow_align_start(inode->i_sb,
2782                                                            *cow_start, cpos);
2783                         *cow_len = rec_end - *cow_start;
2784                 } else {
2785                         /*
2786                          * Ok, the entire write lives in the middle of
2787                          * this extent.  Let's try to slice the extent up
2788                          * nicely.  Optimally, our CoW region starts at
2789                          * m*contig_clusters from the beginning of the
2790                          * extent and goes for n*contig_clusters,
2791                          * covering the entire write.
2792                          */
2793                         *cow_start = ocfs2_cow_align_start(inode->i_sb,
2794                                                            *cow_start, cpos);
2795
2796                         want_clusters = (cpos + write_len) - *cow_start;
2797                         want_clusters = ocfs2_cow_align_length(inode->i_sb,
2798                                                                want_clusters);
2799                         if (*cow_start + want_clusters <= rec_end)
2800                                 *cow_len = want_clusters;
2801                         else
2802                                 *cow_len = rec_end - *cow_start;
2803                 }
2804
2805                 /* Have we covered our entire write yet? */
2806                 if ((*cow_start + *cow_len) >= (cpos + write_len))
2807                         break;
2808
2809                 /*
2810                  * If we reach the end of the extent block and don't get enough
2811                  * clusters, continue with the next extent block if possible.
2812                  */
2813                 if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2814                     eb && eb->h_next_leaf_blk) {
2815                         brelse(eb_bh);
2816                         eb_bh = NULL;
2817
2818                         ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2819                                                le64_to_cpu(eb->h_next_leaf_blk),
2820                                                &eb_bh);
2821                         if (ret) {
2822                                 mlog_errno(ret);
2823                                 goto out;
2824                         }
2825
2826                         eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2827                         el = &eb->h_list;
2828                         i = -1;
2829                 }
2830         }
2831
2832 out:
2833         brelse(eb_bh);
2834         return ret;
2835 }
2836
2837 /*
2838  * Prepare meta_ac, data_ac and calculate credits when we want to add some
2839  * num_clusters in data_tree "et" and change the refcount for the old
2840  * clusters(starting form p_cluster) in the refcount tree.
2841  *
2842  * Note:
2843  * 1. since we may split the old tree, so we at most will need num_clusters + 2
2844  *    more new leaf records.
2845  * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
2846  *    just give data_ac = NULL.
2847  */
2848 static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2849                                         u32 p_cluster, u32 num_clusters,
2850                                         struct ocfs2_extent_tree *et,
2851                                         struct ocfs2_caching_info *ref_ci,
2852                                         struct buffer_head *ref_root_bh,
2853                                         struct ocfs2_alloc_context **meta_ac,
2854                                         struct ocfs2_alloc_context **data_ac,
2855                                         int *credits)
2856 {
2857         int ret = 0, meta_add = 0;
2858         int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2859
2860         if (num_free_extents < 0) {
2861                 ret = num_free_extents;
2862                 mlog_errno(ret);
2863                 goto out;
2864         }
2865
2866         if (num_free_extents < num_clusters + 2)
2867                 meta_add =
2868                         ocfs2_extend_meta_needed(et->et_root_el);
2869
2870         *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2871                                               num_clusters + 2);
2872
2873         ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2874                                                p_cluster, num_clusters,
2875                                                &meta_add, credits);
2876         if (ret) {
2877                 mlog_errno(ret);
2878                 goto out;
2879         }
2880
2881         mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
2882              meta_add, num_clusters, *credits);
2883         ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2884                                                 meta_ac);
2885         if (ret) {
2886                 mlog_errno(ret);
2887                 goto out;
2888         }
2889
2890         if (data_ac) {
2891                 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2892                                              data_ac);
2893                 if (ret)
2894                         mlog_errno(ret);
2895         }
2896
2897 out:
2898         if (ret) {
2899                 if (*meta_ac) {
2900                         ocfs2_free_alloc_context(*meta_ac);
2901                         *meta_ac = NULL;
2902                 }
2903         }
2904
2905         return ret;
2906 }
2907
2908 static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2909 {
2910         BUG_ON(buffer_dirty(bh));
2911
2912         clear_buffer_mapped(bh);
2913
2914         return 0;
2915 }
2916
2917 static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2918                                             struct ocfs2_cow_context *context,
2919                                             u32 cpos, u32 old_cluster,
2920                                             u32 new_cluster, u32 new_len)
2921 {
2922         int ret = 0, partial;
2923         struct ocfs2_caching_info *ci = context->data_et.et_ci;
2924         struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2925         u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2926         struct page *page;
2927         pgoff_t page_index;
2928         unsigned int from, to;
2929         loff_t offset, end, map_end;
2930         struct address_space *mapping = context->inode->i_mapping;
2931
2932         mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2933              new_cluster, new_len, cpos);
2934
2935         offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2936         end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2937
2938         while (offset < end) {
2939                 page_index = offset >> PAGE_CACHE_SHIFT;
2940                 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2941                 if (map_end > end)
2942                         map_end = end;
2943
2944                 /* from, to is the offset within the page. */
2945                 from = offset & (PAGE_CACHE_SIZE - 1);
2946                 to = PAGE_CACHE_SIZE;
2947                 if (map_end & (PAGE_CACHE_SIZE - 1))
2948                         to = map_end & (PAGE_CACHE_SIZE - 1);
2949
2950                 page = grab_cache_page(mapping, page_index);
2951
2952                 /*
2953                  * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
2954                  * can't be dirtied before we CoW it out.
2955                  */
2956                 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2957                         BUG_ON(PageDirty(page));
2958
2959                 if (!PageUptodate(page)) {
2960                         ret = block_read_full_page(page, ocfs2_get_block);
2961                         if (ret) {
2962                                 mlog_errno(ret);
2963                                 goto unlock;
2964                         }
2965                         lock_page(page);
2966                 }
2967
2968                 if (page_has_buffers(page)) {
2969                         ret = walk_page_buffers(handle, page_buffers(page),
2970                                                 from, to, &partial,
2971                                                 ocfs2_clear_cow_buffer);
2972                         if (ret) {
2973                                 mlog_errno(ret);
2974                                 goto unlock;
2975                         }
2976                 }
2977
2978                 ocfs2_map_and_dirty_page(context->inode,
2979                                          handle, from, to,
2980                                          page, 0, &new_block);
2981                 mark_page_accessed(page);
2982 unlock:
2983                 unlock_page(page);
2984                 page_cache_release(page);
2985                 page = NULL;
2986                 offset = map_end;
2987                 if (ret)
2988                         break;
2989         }
2990
2991         return ret;
2992 }
2993
2994 static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
2995                                            struct ocfs2_cow_context *context,
2996                                            u32 cpos, u32 old_cluster,
2997                                            u32 new_cluster, u32 new_len)
2998 {
2999         int ret = 0;
3000         struct super_block *sb = context->inode->i_sb;
3001         struct ocfs2_caching_info *ci = context->data_et.et_ci;
3002         int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3003         u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3004         u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
3005         struct ocfs2_super *osb = OCFS2_SB(sb);
3006         struct buffer_head *old_bh = NULL;
3007         struct buffer_head *new_bh = NULL;
3008
3009         mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
3010              new_cluster, new_len);
3011
3012         for (i = 0; i < blocks; i++, old_block++, new_block++) {
3013                 new_bh = sb_getblk(osb->sb, new_block);
3014                 if (new_bh == NULL) {
3015                         ret = -EIO;
3016                         mlog_errno(ret);
3017                         break;
3018                 }
3019
3020                 ocfs2_set_new_buffer_uptodate(ci, new_bh);
3021
3022                 ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
3023                 if (ret) {
3024                         mlog_errno(ret);
3025                         break;
3026                 }
3027
3028                 ret = ocfs2_journal_access(handle, ci, new_bh,
3029                                            OCFS2_JOURNAL_ACCESS_CREATE);
3030                 if (ret) {
3031                         mlog_errno(ret);
3032                         break;
3033                 }
3034
3035                 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3036                 ocfs2_journal_dirty(handle, new_bh);
3037
3038                 brelse(new_bh);
3039                 brelse(old_bh);
3040                 new_bh = NULL;
3041                 old_bh = NULL;
3042         }
3043
3044         brelse(new_bh);
3045         brelse(old_bh);
3046         return ret;
3047 }
3048
3049 static int ocfs2_clear_ext_refcount(handle_t *handle,
3050                                     struct ocfs2_extent_tree *et,
3051                                     u32 cpos, u32 p_cluster, u32 len,
3052                                     unsigned int ext_flags,
3053                                     struct ocfs2_alloc_context *meta_ac,
3054                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
3055 {
3056         int ret, index;
3057         struct ocfs2_extent_rec replace_rec;
3058         struct ocfs2_path *path = NULL;
3059         struct ocfs2_extent_list *el;
3060         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
3061         u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
3062
3063         mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
3064              (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
3065
3066         memset(&replace_rec, 0, sizeof(replace_rec));
3067         replace_rec.e_cpos = cpu_to_le32(cpos);
3068         replace_rec.e_leaf_clusters = cpu_to_le16(len);
3069         replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
3070                                                                    p_cluster));
3071         replace_rec.e_flags = ext_flags;
3072         replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
3073
3074         path = ocfs2_new_path_from_et(et);
3075         if (!path) {
3076                 ret = -ENOMEM;
3077                 mlog_errno(ret);
3078                 goto out;
3079         }
3080
3081         ret = ocfs2_find_path(et->et_ci, path, cpos);
3082         if (ret) {
3083                 mlog_errno(ret);
3084                 goto out;
3085         }
3086
3087         el = path_leaf_el(path);
3088
3089         index = ocfs2_search_extent_list(el, cpos);
3090         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
3091                 ocfs2_error(sb,
3092                             "Inode %llu has an extent at cpos %u which can no "
3093                             "longer be found.\n",
3094                             (unsigned long long)ino, cpos);
3095                 ret = -EROFS;
3096                 goto out;
3097         }
3098
3099         ret = ocfs2_split_extent(handle, et, path, index,
3100                                  &replace_rec, meta_ac, dealloc);
3101         if (ret)
3102                 mlog_errno(ret);
3103
3104 out:
3105         ocfs2_free_path(path);
3106         return ret;
3107 }
3108
3109 static int ocfs2_replace_clusters(handle_t *handle,
3110                                   struct ocfs2_cow_context *context,
3111                                   u32 cpos, u32 old,
3112                                   u32 new, u32 len,
3113                                   unsigned int ext_flags)
3114 {
3115         int ret;
3116         struct ocfs2_caching_info *ci = context->data_et.et_ci;
3117         u64 ino = ocfs2_metadata_cache_owner(ci);
3118
3119         mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
3120              (unsigned long long)ino, cpos, old, new, len, ext_flags);
3121
3122         /*If the old clusters is unwritten, no need to duplicate. */
3123         if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3124                 ret = context->cow_duplicate_clusters(handle, context, cpos,
3125                                                       old, new, len);
3126                 if (ret) {
3127                         mlog_errno(ret);
3128                         goto out;
3129                 }
3130         }
3131
3132         ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3133                                        cpos, new, len, ext_flags,
3134                                        context->meta_ac, &context->dealloc);
3135         if (ret)
3136                 mlog_errno(ret);
3137 out:
3138         return ret;
3139 }
3140
3141 static int ocfs2_cow_sync_writeback(struct super_block *sb,
3142                                     struct ocfs2_cow_context *context,
3143                                     u32 cpos, u32 num_clusters)
3144 {
3145         int ret = 0;
3146         loff_t offset, end, map_end;
3147         pgoff_t page_index;
3148         struct page *page;
3149
3150         if (ocfs2_should_order_data(context->inode))
3151                 return 0;
3152
3153         offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3154         end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3155
3156         ret = filemap_fdatawrite_range(context->inode->i_mapping,
3157                                        offset, end - 1);
3158         if (ret < 0) {
3159                 mlog_errno(ret);
3160                 return ret;
3161         }
3162
3163         while (offset < end) {
3164                 page_index = offset >> PAGE_CACHE_SHIFT;
3165                 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3166                 if (map_end > end)
3167                         map_end = end;
3168
3169                 page = grab_cache_page(context->inode->i_mapping, page_index);
3170                 BUG_ON(!page);
3171
3172                 wait_on_page_writeback(page);
3173                 if (PageError(page)) {
3174                         ret = -EIO;
3175                         mlog_errno(ret);
3176                 } else
3177                         mark_page_accessed(page);
3178
3179                 unlock_page(page);
3180                 page_cache_release(page);
3181                 page = NULL;
3182                 offset = map_end;
3183                 if (ret)
3184                         break;
3185         }
3186
3187         return ret;
3188 }
3189
3190 static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3191                                  u32 v_cluster, u32 *p_cluster,
3192                                  u32 *num_clusters,
3193                                  unsigned int *extent_flags)
3194 {
3195         return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3196                                   num_clusters, extent_flags);
3197 }
3198
3199 static int ocfs2_make_clusters_writable(struct super_block *sb,
3200                                         struct ocfs2_cow_context *context,
3201                                         u32 cpos, u32 p_cluster,
3202                                         u32 num_clusters, unsigned int e_flags)
3203 {
3204         int ret, delete, index, credits =  0;
3205         u32 new_bit, new_len;
3206         unsigned int set_len;
3207         struct ocfs2_super *osb = OCFS2_SB(sb);
3208         handle_t *handle;
3209         struct buffer_head *ref_leaf_bh = NULL;
3210         struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3211         struct ocfs2_refcount_rec rec;
3212
3213         mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
3214              cpos, p_cluster, num_clusters, e_flags);
3215
3216         ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3217                                              &context->data_et,
3218                                              ref_ci,
3219                                              context->ref_root_bh,
3220                                              &context->meta_ac,
3221                                              &context->data_ac, &credits);
3222         if (ret) {
3223                 mlog_errno(ret);
3224                 return ret;
3225         }
3226
3227         if (context->post_refcount)
3228                 credits += context->post_refcount->credits;
3229
3230         credits += context->extra_credits;
3231         handle = ocfs2_start_trans(osb, credits);
3232         if (IS_ERR(handle)) {
3233                 ret = PTR_ERR(handle);
3234                 mlog_errno(ret);
3235                 goto out;
3236         }
3237
3238         while (num_clusters) {
3239                 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3240                                              p_cluster, num_clusters,
3241                                              &rec, &index, &ref_leaf_bh);
3242                 if (ret) {
3243                         mlog_errno(ret);
3244                         goto out_commit;
3245                 }
3246
3247                 BUG_ON(!rec.r_refcount);
3248                 set_len = min((u64)p_cluster + num_clusters,
3249                               le64_to_cpu(rec.r_cpos) +
3250                               le32_to_cpu(rec.r_clusters)) - p_cluster;
3251
3252                 /*
3253                  * There are many different situation here.
3254                  * 1. If refcount == 1, remove the flag and don't COW.
3255                  * 2. If refcount > 1, allocate clusters.
3256                  *    Here we may not allocate r_len once at a time, so continue
3257                  *    until we reach num_clusters.
3258                  */
3259                 if (le32_to_cpu(rec.r_refcount) == 1) {
3260                         delete = 0;
3261                         ret = ocfs2_clear_ext_refcount(handle,
3262                                                        &context->data_et,
3263                                                        cpos, p_cluster,
3264                                                        set_len, e_flags,
3265                                                        context->meta_ac,
3266                                                        &context->dealloc);
3267                         if (ret) {
3268                                 mlog_errno(ret);
3269                                 goto out_commit;
3270                         }
3271                 } else {
3272                         delete = 1;
3273
3274                         ret = __ocfs2_claim_clusters(osb, handle,
3275                                                      context->data_ac,
3276                                                      1, set_len,
3277                                                      &new_bit, &new_len);
3278                         if (ret) {
3279                                 mlog_errno(ret);
3280                                 goto out_commit;
3281                         }
3282
3283                         ret = ocfs2_replace_clusters(handle, context,
3284                                                      cpos, p_cluster, new_bit,
3285                                                      new_len, e_flags);
3286                         if (ret) {
3287                                 mlog_errno(ret);
3288                                 goto out_commit;
3289                         }
3290                         set_len = new_len;
3291                 }
3292
3293                 ret = __ocfs2_decrease_refcount(handle, ref_ci,
3294                                                 context->ref_root_bh,
3295                                                 p_cluster, set_len,
3296                                                 context->meta_ac,
3297                                                 &context->dealloc, delete);
3298                 if (ret) {
3299                         mlog_errno(ret);
3300                         goto out_commit;
3301                 }
3302
3303                 cpos += set_len;
3304                 p_cluster += set_len;
3305                 num_clusters -= set_len;
3306                 brelse(ref_leaf_bh);
3307                 ref_leaf_bh = NULL;
3308         }
3309
3310         /* handle any post_cow action. */
3311         if (context->post_refcount && context->post_refcount->func) {
3312                 ret = context->post_refcount->func(context->inode, handle,
3313                                                 context->post_refcount->para);
3314                 if (ret) {
3315                         mlog_errno(ret);
3316                         goto out_commit;
3317                 }
3318         }
3319
3320         /*
3321          * Here we should write the new page out first if we are
3322          * in write-back mode.
3323          */
3324         if (context->get_clusters == ocfs2_di_get_clusters) {
3325                 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
3326                 if (ret)
3327                         mlog_errno(ret);
3328         }
3329
3330 out_commit:
3331         ocfs2_commit_trans(osb, handle);
3332
3333 out:
3334         if (context->data_ac) {
3335                 ocfs2_free_alloc_context(context->data_ac);
3336                 context->data_ac = NULL;
3337         }
3338         if (context->meta_ac) {
3339                 ocfs2_free_alloc_context(context->meta_ac);
3340                 context->meta_ac = NULL;
3341         }
3342         brelse(ref_leaf_bh);
3343
3344         return ret;
3345 }
3346
3347 static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3348 {
3349         int ret = 0;
3350         struct inode *inode = context->inode;
3351         u32 cow_start = context->cow_start, cow_len = context->cow_len;
3352         u32 p_cluster, num_clusters;
3353         unsigned int ext_flags;
3354         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3355
3356         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3357                 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
3358                             "tree, but the feature bit is not set in the "
3359                             "super block.", inode->i_ino);
3360                 return -EROFS;
3361         }
3362
3363         ocfs2_init_dealloc_ctxt(&context->dealloc);
3364
3365         while (cow_len) {
3366                 ret = context->get_clusters(context, cow_start, &p_cluster,
3367                                             &num_clusters, &ext_flags);
3368                 if (ret) {
3369                         mlog_errno(ret);
3370                         break;
3371                 }
3372
3373                 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3374
3375                 if (cow_len < num_clusters)
3376                         num_clusters = cow_len;
3377
3378                 ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3379                                                    cow_start, p_cluster,
3380                                                    num_clusters, ext_flags);
3381                 if (ret) {
3382                         mlog_errno(ret);
3383                         break;
3384                 }
3385
3386                 cow_len -= num_clusters;
3387                 cow_start += num_clusters;
3388         }
3389
3390         if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3391                 ocfs2_schedule_truncate_log_flush(osb, 1);
3392                 ocfs2_run_deallocs(osb, &context->dealloc);
3393         }
3394
3395         return ret;
3396 }
3397
3398 /*
3399  * Starting at cpos, try to CoW write_len clusters.  Don't CoW
3400  * past max_cpos.  This will stop when it runs into a hole or an
3401  * unrefcounted extent.
3402  */
3403 static int ocfs2_refcount_cow_hunk(struct inode *inode,
3404                                    struct buffer_head *di_bh,
3405                                    u32 cpos, u32 write_len, u32 max_cpos)
3406 {
3407         int ret;
3408         u32 cow_start = 0, cow_len = 0;
3409         struct ocfs2_inode_info *oi = OCFS2_I(inode);
3410         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3411         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3412         struct buffer_head *ref_root_bh = NULL;
3413         struct ocfs2_refcount_tree *ref_tree;
3414         struct ocfs2_cow_context *context = NULL;
3415
3416         BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3417
3418         ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3419                                               cpos, write_len, max_cpos,
3420                                               &cow_start, &cow_len);
3421         if (ret) {
3422                 mlog_errno(ret);
3423                 goto out;
3424         }
3425
3426         mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
3427              "cow_len %u\n", inode->i_ino,
3428              cpos, write_len, cow_start, cow_len);
3429
3430         BUG_ON(cow_len == 0);
3431
3432         context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3433         if (!context) {
3434                 ret = -ENOMEM;
3435                 mlog_errno(ret);
3436                 goto out;
3437         }
3438
3439         ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3440                                        1, &ref_tree, &ref_root_bh);
3441         if (ret) {
3442                 mlog_errno(ret);
3443                 goto out;
3444         }
3445
3446         context->inode = inode;
3447         context->cow_start = cow_start;
3448         context->cow_len = cow_len;
3449         context->ref_tree = ref_tree;
3450         context->ref_root_bh = ref_root_bh;
3451         context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3452         context->get_clusters = ocfs2_di_get_clusters;
3453
3454         ocfs2_init_dinode_extent_tree(&context->data_et,
3455                                       INODE_CACHE(inode), di_bh);
3456
3457         ret = ocfs2_replace_cow(context);
3458         if (ret)
3459                 mlog_errno(ret);
3460
3461         /*
3462          * truncate the extent map here since no matter whether we meet with
3463          * any error during the action, we shouldn't trust cached extent map
3464          * any more.
3465          */
3466         ocfs2_extent_map_trunc(inode, cow_start);
3467
3468         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3469         brelse(ref_root_bh);
3470 out:
3471         kfree(context);
3472         return ret;
3473 }
3474
3475 /*
3476  * CoW any and all clusters between cpos and cpos+write_len.
3477  * Don't CoW past max_cpos.  If this returns successfully, all
3478  * clusters between cpos and cpos+write_len are safe to modify.
3479  */
3480 int ocfs2_refcount_cow(struct inode *inode,
3481                        struct buffer_head *di_bh,
3482                        u32 cpos, u32 write_len, u32 max_cpos)
3483 {
3484         int ret = 0;
3485         u32 p_cluster, num_clusters;
3486         unsigned int ext_flags;
3487
3488         while (write_len) {
3489                 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3490                                          &num_clusters, &ext_flags);
3491                 if (ret) {
3492                         mlog_errno(ret);
3493                         break;
3494                 }
3495
3496                 if (write_len < num_clusters)
3497                         num_clusters = write_len;
3498
3499                 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3500                         ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3501                                                       num_clusters, max_cpos);
3502                         if (ret) {
3503                                 mlog_errno(ret);
3504                                 break;
3505                         }
3506                 }
3507
3508                 write_len -= num_clusters;
3509                 cpos += num_clusters;
3510         }
3511
3512         return ret;
3513 }
3514
3515 static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3516                                           u32 v_cluster, u32 *p_cluster,
3517                                           u32 *num_clusters,
3518                                           unsigned int *extent_flags)
3519 {
3520         struct inode *inode = context->inode;
3521         struct ocfs2_xattr_value_root *xv = context->cow_object;
3522
3523         return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3524                                         num_clusters, &xv->xr_list,
3525                                         extent_flags);
3526 }
3527
3528 /*
3529  * Given a xattr value root, calculate the most meta/credits we need for
3530  * refcount tree change if we truncate it to 0.
3531  */
3532 int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3533                                        struct ocfs2_caching_info *ref_ci,
3534                                        struct buffer_head *ref_root_bh,
3535                                        struct ocfs2_xattr_value_root *xv,
3536                                        int *meta_add, int *credits)
3537 {
3538         int ret = 0, index, ref_blocks = 0;
3539         u32 p_cluster, num_clusters;
3540         u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3541         struct ocfs2_refcount_block *rb;
3542         struct ocfs2_refcount_rec rec;
3543         struct buffer_head *ref_leaf_bh = NULL;
3544
3545         while (cpos < clusters) {
3546                 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3547                                                &num_clusters, &xv->xr_list,
3548                                                NULL);
3549                 if (ret) {
3550                         mlog_errno(ret);
3551                         goto out;
3552                 }
3553
3554                 cpos += num_clusters;
3555
3556                 while (num_clusters) {
3557                         ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3558                                                      p_cluster, num_clusters,
3559                                                      &rec, &index,
3560                                                      &ref_leaf_bh);
3561                         if (ret) {
3562                                 mlog_errno(ret);
3563                                 goto out;
3564                         }
3565
3566                         BUG_ON(!rec.r_refcount);
3567
3568                         rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3569
3570                         /*
3571                          * We really don't know whether the other clusters is in
3572                          * this refcount block or not, so just take the worst
3573                          * case that all the clusters are in this block and each
3574                          * one will split a refcount rec, so totally we need
3575                          * clusters * 2 new refcount rec.
3576                          */
3577                         if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3578                             le16_to_cpu(rb->rf_records.rl_count))
3579                                 ref_blocks++;
3580
3581                         *credits += 1;
3582                         brelse(ref_leaf_bh);
3583                         ref_leaf_bh = NULL;
3584
3585                         if (num_clusters <= le32_to_cpu(rec.r_clusters))
3586                                 break;
3587                         else
3588                                 num_clusters -= le32_to_cpu(rec.r_clusters);
3589                         p_cluster += num_clusters;
3590                 }
3591         }
3592
3593         *meta_add += ref_blocks;
3594         if (!ref_blocks)
3595                 goto out;
3596
3597         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3598         if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
3599                 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3600         else {
3601                 struct ocfs2_extent_tree et;
3602
3603                 ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3604                 *credits += ocfs2_calc_extend_credits(inode->i_sb,
3605                                                       et.et_root_el,
3606                                                       ref_blocks);
3607         }
3608
3609 out:
3610         brelse(ref_leaf_bh);
3611         return ret;
3612 }
3613
3614 /*
3615  * Do CoW for xattr.
3616  */
3617 int ocfs2_refcount_cow_xattr(struct inode *inode,
3618                              struct ocfs2_dinode *di,
3619                              struct ocfs2_xattr_value_buf *vb,
3620                              struct ocfs2_refcount_tree *ref_tree,
3621                              struct buffer_head *ref_root_bh,
3622                              u32 cpos, u32 write_len,
3623                              struct ocfs2_post_refcount *post)
3624 {
3625         int ret;
3626         struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3627         struct ocfs2_inode_info *oi = OCFS2_I(inode);
3628         struct ocfs2_cow_context *context = NULL;
3629         u32 cow_start, cow_len;
3630
3631         BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3632
3633         ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3634                                               cpos, write_len, UINT_MAX,
3635                                               &cow_start, &cow_len);
3636         if (ret) {
3637                 mlog_errno(ret);
3638                 goto out;
3639         }
3640
3641         BUG_ON(cow_len == 0);
3642
3643         context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3644         if (!context) {
3645                 ret = -ENOMEM;
3646                 mlog_errno(ret);
3647                 goto out;
3648         }
3649
3650         context->inode = inode;
3651         context->cow_start = cow_start;
3652         context->cow_len = cow_len;
3653         context->ref_tree = ref_tree;
3654         context->ref_root_bh = ref_root_bh;;
3655         context->cow_object = xv;
3656
3657         context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3658         /* We need the extra credits for duplicate_clusters by jbd. */
3659         context->extra_credits =
3660                 ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3661         context->get_clusters = ocfs2_xattr_value_get_clusters;
3662         context->post_refcount = post;
3663
3664         ocfs2_init_xattr_value_extent_tree(&context->data_et,
3665                                            INODE_CACHE(inode), vb);
3666
3667         ret = ocfs2_replace_cow(context);
3668         if (ret)
3669                 mlog_errno(ret);
3670
3671 out:
3672         kfree(context);
3673         return ret;
3674 }
3675
3676 /*
3677  * Insert a new extent into refcount tree and mark a extent rec
3678  * as refcounted in the dinode tree.
3679  */
3680 int ocfs2_add_refcount_flag(struct inode *inode,
3681                             struct ocfs2_extent_tree *data_et,
3682                             struct ocfs2_caching_info *ref_ci,
3683                             struct buffer_head *ref_root_bh,
3684                             u32 cpos, u32 p_cluster, u32 num_clusters,
3685                             struct ocfs2_cached_dealloc_ctxt *dealloc,
3686                             struct ocfs2_post_refcount *post)
3687 {
3688         int ret;
3689         handle_t *handle;
3690         int credits = 1, ref_blocks = 0;
3691         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3692         struct ocfs2_alloc_context *meta_ac = NULL;
3693
3694         ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3695                                                ref_ci, ref_root_bh,
3696                                                p_cluster, num_clusters,
3697                                                &ref_blocks, &credits);
3698         if (ret) {
3699                 mlog_errno(ret);
3700                 goto out;
3701         }
3702
3703         mlog(0, "reserve new metadata %d, credits = %d\n",
3704              ref_blocks, credits);
3705
3706         if (ref_blocks) {
3707                 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
3708                                                         ref_blocks, &meta_ac);
3709                 if (ret) {
3710                         mlog_errno(ret);
3711                         goto out;
3712                 }
3713         }
3714
3715         if (post)
3716                 credits += post->credits;
3717
3718         handle = ocfs2_start_trans(osb, credits);
3719         if (IS_ERR(handle)) {
3720                 ret = PTR_ERR(handle);
3721                 mlog_errno(ret);
3722                 goto out;
3723         }
3724
3725         ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3726                                            cpos, num_clusters, p_cluster,
3727                                            meta_ac, dealloc);
3728         if (ret) {
3729                 mlog_errno(ret);
3730                 goto out_commit;
3731         }
3732
3733         ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3734                                         p_cluster, num_clusters, 0,
3735                                         meta_ac, dealloc);
3736         if (ret) {
3737                 mlog_errno(ret);
3738                 goto out_commit;
3739         }
3740
3741         if (post && post->func) {
3742                 ret = post->func(inode, handle, post->para);
3743                 if (ret)
3744                         mlog_errno(ret);
3745         }
3746
3747 out_commit:
3748         ocfs2_commit_trans(osb, handle);
3749 out:
3750         if (meta_ac)
3751                 ocfs2_free_alloc_context(meta_ac);
3752         return ret;
3753 }
3754
3755 static int ocfs2_change_ctime(struct inode *inode,
3756                               struct buffer_head *di_bh)
3757 {
3758         int ret;
3759         handle_t *handle;
3760         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3761
3762         handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3763                                    OCFS2_INODE_UPDATE_CREDITS);
3764         if (IS_ERR(handle)) {
3765                 ret = PTR_ERR(handle);
3766                 mlog_errno(ret);
3767                 goto out;
3768         }
3769
3770         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3771                                       OCFS2_JOURNAL_ACCESS_WRITE);
3772         if (ret) {
3773                 mlog_errno(ret);
3774                 goto out_commit;
3775         }
3776
3777         inode->i_ctime = CURRENT_TIME;
3778         di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3779         di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3780
3781         ocfs2_journal_dirty(handle, di_bh);
3782
3783 out_commit:
3784         ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3785 out:
3786         return ret;
3787 }
3788
3789 static int ocfs2_attach_refcount_tree(struct inode *inode,
3790                                       struct buffer_head *di_bh)
3791 {
3792         int ret, data_changed = 0;
3793         struct buffer_head *ref_root_bh = NULL;
3794         struct ocfs2_inode_info *oi = OCFS2_I(inode);
3795         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3796         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3797         struct ocfs2_refcount_tree *ref_tree;
3798         unsigned int ext_flags;
3799         loff_t size;
3800         u32 cpos, num_clusters, clusters, p_cluster;
3801         struct ocfs2_cached_dealloc_ctxt dealloc;
3802         struct ocfs2_extent_tree di_et;
3803
3804         ocfs2_init_dealloc_ctxt(&dealloc);
3805
3806         if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
3807                 ret = ocfs2_create_refcount_tree(inode, di_bh);
3808                 if (ret) {
3809                         mlog_errno(ret);
3810                         goto out;
3811                 }
3812         }
3813
3814         BUG_ON(!di->i_refcount_loc);
3815         ret = ocfs2_lock_refcount_tree(osb,
3816                                        le64_to_cpu(di->i_refcount_loc), 1,
3817                                        &ref_tree, &ref_root_bh);
3818         if (ret) {
3819                 mlog_errno(ret);
3820                 goto out;
3821         }
3822
3823         if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3824                 goto attach_xattr;
3825
3826         ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3827
3828         size = i_size_read(inode);
3829         clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3830
3831         cpos = 0;
3832         while (cpos < clusters) {
3833                 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3834                                          &num_clusters, &ext_flags);
3835
3836                 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3837                         ret = ocfs2_add_refcount_flag(inode, &di_et,
3838                                                       &ref_tree->rf_ci,
3839                                                       ref_root_bh, cpos,
3840                                                       p_cluster, num_clusters,
3841                                                       &dealloc, NULL);
3842                         if (ret) {
3843                                 mlog_errno(ret);
3844                                 goto unlock;
3845                         }
3846
3847                         data_changed = 1;
3848                 }
3849                 cpos += num_clusters;
3850         }
3851
3852 attach_xattr:
3853         if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3854                 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3855                                                        &ref_tree->rf_ci,
3856                                                        ref_root_bh,
3857                                                        &dealloc);
3858                 if (ret) {
3859                         mlog_errno(ret);
3860                         goto unlock;
3861                 }
3862         }
3863
3864         if (data_changed) {
3865                 ret = ocfs2_change_ctime(inode, di_bh);
3866                 if (ret)
3867                         mlog_errno(ret);
3868         }
3869
3870 unlock:
3871         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3872         brelse(ref_root_bh);
3873
3874         if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3875                 ocfs2_schedule_truncate_log_flush(osb, 1);
3876                 ocfs2_run_deallocs(osb, &dealloc);
3877         }
3878 out:
3879         /*
3880          * Empty the extent map so that we may get the right extent
3881          * record from the disk.
3882          */
3883         ocfs2_extent_map_trunc(inode, 0);
3884
3885         return ret;
3886 }
3887
3888 static int ocfs2_add_refcounted_extent(struct inode *inode,
3889                                    struct ocfs2_extent_tree *et,
3890                                    struct ocfs2_caching_info *ref_ci,
3891                                    struct buffer_head *ref_root_bh,
3892                                    u32 cpos, u32 p_cluster, u32 num_clusters,
3893                                    unsigned int ext_flags,
3894                                    struct ocfs2_cached_dealloc_ctxt *dealloc)
3895 {
3896         int ret;
3897         handle_t *handle;
3898         int credits = 0;
3899         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3900         struct ocfs2_alloc_context *meta_ac = NULL;
3901
3902         ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3903                                              p_cluster, num_clusters,
3904                                              et, ref_ci,
3905                                              ref_root_bh, &meta_ac,
3906                                              NULL, &credits);
3907         if (ret) {
3908                 mlog_errno(ret);
3909                 goto out;
3910         }
3911
3912         handle = ocfs2_start_trans(osb, credits);
3913         if (IS_ERR(handle)) {
3914                 ret = PTR_ERR(handle);
3915                 mlog_errno(ret);
3916                 goto out;
3917         }
3918
3919         ret = ocfs2_insert_extent(handle, et, cpos,
3920                         ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3921                         num_clusters, ext_flags, meta_ac);
3922         if (ret) {
3923                 mlog_errno(ret);
3924                 goto out_commit;
3925         }
3926
3927         ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3928                                       p_cluster, num_clusters,
3929                                       meta_ac, dealloc);
3930         if (ret)
3931                 mlog_errno(ret);
3932
3933 out_commit:
3934         ocfs2_commit_trans(osb, handle);
3935 out:
3936         if (meta_ac)
3937                 ocfs2_free_alloc_context(meta_ac);
3938         return ret;
3939 }
3940
3941 static int ocfs2_duplicate_inline_data(struct inode *s_inode,
3942                                        struct buffer_head *s_bh,
3943                                        struct inode *t_inode,
3944                                        struct buffer_head *t_bh)
3945 {
3946         int ret;
3947         handle_t *handle;
3948         struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
3949         struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
3950         struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
3951
3952         BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
3953
3954         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
3955         if (IS_ERR(handle)) {
3956                 ret = PTR_ERR(handle);
3957                 mlog_errno(ret);
3958                 goto out;
3959         }
3960
3961         ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
3962                                       OCFS2_JOURNAL_ACCESS_WRITE);
3963         if (ret) {
3964                 mlog_errno(ret);
3965                 goto out_commit;
3966         }
3967
3968         t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
3969         memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
3970                le16_to_cpu(s_di->id2.i_data.id_count));
3971         spin_lock(&OCFS2_I(t_inode)->ip_lock);
3972         OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
3973         t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
3974         spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3975
3976         ocfs2_journal_dirty(handle, t_bh);
3977
3978 out_commit:
3979         ocfs2_commit_trans(osb, handle);
3980 out:
3981         return ret;
3982 }
3983
3984 static int ocfs2_duplicate_extent_list(struct inode *s_inode,
3985                                 struct inode *t_inode,
3986                                 struct buffer_head *t_bh,
3987                                 struct ocfs2_caching_info *ref_ci,
3988                                 struct buffer_head *ref_root_bh,
3989                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
3990 {
3991         int ret = 0;
3992         u32 p_cluster, num_clusters, clusters, cpos;
3993         loff_t size;
3994         unsigned int ext_flags;
3995         struct ocfs2_extent_tree et;
3996
3997         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
3998
3999         size = i_size_read(s_inode);
4000         clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
4001
4002         cpos = 0;
4003         while (cpos < clusters) {
4004                 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
4005                                          &num_clusters, &ext_flags);
4006
4007                 if (p_cluster) {
4008                         ret = ocfs2_add_refcounted_extent(t_inode, &et,
4009                                                           ref_ci, ref_root_bh,
4010                                                           cpos, p_cluster,
4011                                                           num_clusters,
4012                                                           ext_flags,
4013                                                           dealloc);
4014                         if (ret) {
4015                                 mlog_errno(ret);
4016                                 goto out;
4017                         }
4018                 }
4019
4020                 cpos += num_clusters;
4021         }
4022
4023 out:
4024         return ret;
4025 }
4026
4027 /*
4028  * change the new file's attributes to the src.
4029  *
4030  * reflink creates a snapshot of a file, that means the attributes
4031  * must be identical except for three exceptions - nlink, ino, and ctime.
4032  */
4033 static int ocfs2_complete_reflink(struct inode *s_inode,
4034                                   struct buffer_head *s_bh,
4035                                   struct inode *t_inode,
4036                                   struct buffer_head *t_bh,
4037                                   bool preserve)
4038 {
4039         int ret;
4040         handle_t *handle;
4041         struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
4042         struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
4043         loff_t size = i_size_read(s_inode);
4044
4045         handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
4046                                    OCFS2_INODE_UPDATE_CREDITS);
4047         if (IS_ERR(handle)) {
4048                 ret = PTR_ERR(handle);
4049                 mlog_errno(ret);
4050                 return ret;
4051         }
4052
4053         ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
4054                                       OCFS2_JOURNAL_ACCESS_WRITE);
4055         if (ret) {
4056                 mlog_errno(ret);
4057                 goto out_commit;
4058         }
4059
4060         spin_lock(&OCFS2_I(t_inode)->ip_lock);
4061         OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
4062         OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
4063         OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4064         spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4065         i_size_write(t_inode, size);
4066         t_inode->i_blocks = s_inode->i_blocks;
4067
4068         di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4069         di->i_clusters = s_di->i_clusters;
4070         di->i_size = s_di->i_size;
4071         di->i_dyn_features = s_di->i_dyn_features;
4072         di->i_attr = s_di->i_attr;
4073
4074         if (preserve) {
4075                 di->i_uid = s_di->i_uid;
4076                 di->i_gid = s_di->i_gid;
4077                 di->i_mode = s_di->i_mode;
4078
4079                 /*
4080                  * update time.
4081                  * we want mtime to appear identical to the source and
4082                  * update ctime.
4083                  */
4084                 t_inode->i_ctime = CURRENT_TIME;
4085
4086                 di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
4087                 di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
4088
4089                 t_inode->i_mtime = s_inode->i_mtime;
4090                 di->i_mtime = s_di->i_mtime;
4091                 di->i_mtime_nsec = s_di->i_mtime_nsec;
4092         }
4093
4094         ocfs2_journal_dirty(handle, t_bh);
4095
4096 out_commit:
4097         ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
4098         return ret;
4099 }
4100
4101 static int ocfs2_create_reflink_node(struct inode *s_inode,
4102                                      struct buffer_head *s_bh,
4103                                      struct inode *t_inode,
4104                                      struct buffer_head *t_bh,
4105                                      bool preserve)
4106 {
4107         int ret;
4108         struct buffer_head *ref_root_bh = NULL;
4109         struct ocfs2_cached_dealloc_ctxt dealloc;
4110         struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4111         struct ocfs2_refcount_block *rb;
4112         struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
4113         struct ocfs2_refcount_tree *ref_tree;
4114
4115         ocfs2_init_dealloc_ctxt(&dealloc);
4116
4117         ret = ocfs2_set_refcount_tree(t_inode, t_bh,
4118                                       le64_to_cpu(di->i_refcount_loc));
4119         if (ret) {
4120                 mlog_errno(ret);
4121                 goto out;
4122         }
4123
4124         if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4125                 ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
4126                                                   t_inode, t_bh);
4127                 if (ret)
4128                         mlog_errno(ret);
4129                 goto out;
4130         }
4131
4132         ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4133                                        1, &ref_tree, &ref_root_bh);
4134         if (ret) {
4135                 mlog_errno(ret);
4136                 goto out;
4137         }
4138         rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
4139
4140         ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4141                                           &ref_tree->rf_ci, ref_root_bh,
4142                                           &dealloc);
4143         if (ret) {
4144                 mlog_errno(ret);
4145                 goto out_unlock_refcount;
4146         }
4147
4148 out_unlock_refcount:
4149         ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4150         brelse(ref_root_bh);
4151 out:
4152         if (ocfs2_dealloc_has_cluster(&dealloc)) {
4153                 ocfs2_schedule_truncate_log_flush(osb, 1);
4154                 ocfs2_run_deallocs(osb, &dealloc);
4155         }
4156
4157         return ret;
4158 }
4159
4160 static int __ocfs2_reflink(struct dentry *old_dentry,
4161                            struct buffer_head *old_bh,
4162                            struct inode *new_inode,
4163                            bool preserve)
4164 {
4165         int ret;
4166         struct inode *inode = old_dentry->d_inode;
4167         struct buffer_head *new_bh = NULL;
4168
4169         ret = filemap_fdatawrite(inode->i_mapping);
4170         if (ret) {
4171                 mlog_errno(ret);
4172                 goto out;
4173         }
4174
4175         ret = ocfs2_attach_refcount_tree(inode, old_bh);
4176         if (ret) {
4177                 mlog_errno(ret);
4178                 goto out;
4179         }
4180
4181         mutex_lock(&new_inode->i_mutex);
4182         ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
4183         if (ret) {
4184                 mlog_errno(ret);
4185                 goto out_unlock;
4186         }
4187
4188         ret = ocfs2_create_reflink_node(inode, old_bh,
4189                                         new_inode, new_bh, preserve);
4190         if (ret) {
4191                 mlog_errno(ret);
4192                 goto inode_unlock;
4193         }
4194
4195         if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4196                 ret = ocfs2_reflink_xattrs(inode, old_bh,
4197                                            new_inode, new_bh,
4198                                            preserve);
4199                 if (ret) {
4200                         mlog_errno(ret);
4201                         goto inode_unlock;
4202                 }
4203         }
4204
4205         ret = ocfs2_complete_reflink(inode, old_bh,
4206                                      new_inode, new_bh, preserve);
4207         if (ret)
4208                 mlog_errno(ret);
4209
4210 inode_unlock:
4211         ocfs2_inode_unlock(new_inode, 1);
4212         brelse(new_bh);
4213 out_unlock:
4214         mutex_unlock(&new_inode->i_mutex);
4215 out:
4216         if (!ret) {
4217                 ret = filemap_fdatawait(inode->i_mapping);
4218                 if (ret)
4219                         mlog_errno(ret);
4220         }
4221         return ret;
4222 }
4223
4224 static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4225                          struct dentry *new_dentry, bool preserve)
4226 {
4227         int error;
4228         struct inode *inode = old_dentry->d_inode;
4229         struct buffer_head *old_bh = NULL;
4230         struct inode *new_orphan_inode = NULL;
4231
4232         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4233                 return -EOPNOTSUPP;
4234
4235         error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4236                                              &new_orphan_inode);
4237         if (error) {
4238                 mlog_errno(error);
4239                 goto out;
4240         }
4241
4242         error = ocfs2_inode_lock(inode, &old_bh, 1);
4243         if (error) {
4244                 mlog_errno(error);
4245                 goto out;
4246         }
4247
4248         down_write(&OCFS2_I(inode)->ip_xattr_sem);
4249         down_write(&OCFS2_I(inode)->ip_alloc_sem);
4250         error = __ocfs2_reflink(old_dentry, old_bh,
4251                                 new_orphan_inode, preserve);
4252         up_write(&OCFS2_I(inode)->ip_alloc_sem);
4253         up_write(&OCFS2_I(inode)->ip_xattr_sem);
4254
4255         ocfs2_inode_unlock(inode, 1);
4256         brelse(old_bh);
4257
4258         if (error) {
4259                 mlog_errno(error);
4260                 goto out;
4261         }
4262
4263         /* If the security isn't preserved, we need to re-initialize them. */
4264         if (!preserve) {
4265                 error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
4266                 if (error)
4267                         mlog_errno(error);
4268         }
4269 out:
4270         if (!error) {
4271                 error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4272                                                        new_dentry);
4273                 if (error)
4274                         mlog_errno(error);
4275         }
4276
4277         if (new_orphan_inode) {
4278                 /*
4279                  * We need to open_unlock the inode no matter whether we
4280                  * succeed or not, so that other nodes can delete it later.
4281                  */
4282                 ocfs2_open_unlock(new_orphan_inode);
4283                 if (error)
4284                         iput(new_orphan_inode);
4285         }
4286
4287         return error;
4288 }
4289
4290 /*
4291  * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4292  * sys_reflink().  This will go away when vfs_reflink() exists in
4293  * fs/namei.c.
4294  */
4295
4296 /* copied from may_create in VFS. */
4297 static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4298 {
4299         if (child->d_inode)
4300                 return -EEXIST;
4301         if (IS_DEADDIR(dir))
4302                 return -ENOENT;
4303         return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4304 }
4305
4306 /* copied from user_path_parent. */
4307 static int ocfs2_user_path_parent(const char __user *path,
4308                                   struct nameidata *nd, char **name)
4309 {
4310         char *s = getname(path);
4311         int error;
4312
4313         if (IS_ERR(s))
4314                 return PTR_ERR(s);
4315
4316         error = path_lookup(s, LOOKUP_PARENT, nd);
4317         if (error)
4318                 putname(s);
4319         else
4320                 *name = s;
4321
4322         return error;
4323 }
4324
4325 /**
4326  * ocfs2_vfs_reflink - Create a reference-counted link
4327  *
4328  * @old_dentry:        source dentry + inode
4329  * @dir:       directory to create the target
4330  * @new_dentry:        target dentry
4331  * @preserve:  if true, preserve all file attributes
4332  */
4333 static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4334                              struct dentry *new_dentry, bool preserve)
4335 {
4336         struct inode *inode = old_dentry->d_inode;
4337         int error;
4338
4339         if (!inode)
4340                 return -ENOENT;
4341
4342         error = ocfs2_may_create(dir, new_dentry);
4343         if (error)
4344                 return error;
4345
4346         if (dir->i_sb != inode->i_sb)
4347                 return -EXDEV;
4348
4349         /*
4350          * A reflink to an append-only or immutable file cannot be created.
4351          */
4352         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4353                 return -EPERM;
4354
4355         /* Only regular files can be reflinked. */
4356         if (!S_ISREG(inode->i_mode))
4357                 return -EPERM;
4358
4359         /*
4360          * If the caller wants to preserve ownership, they require the
4361          * rights to do so.
4362          */
4363         if (preserve) {
4364                 if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
4365                         return -EPERM;
4366                 if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4367                         return -EPERM;
4368         }
4369
4370         /*
4371          * If the caller is modifying any aspect of the attributes, they
4372          * are not creating a snapshot.  They need read permission on the
4373          * file.
4374          */
4375         if (!preserve) {
4376                 error = inode_permission(inode, MAY_READ);
4377                 if (error)
4378                         return error;
4379         }
4380
4381         mutex_lock(&inode->i_mutex);
4382         dquot_initialize(dir);
4383         error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4384         mutex_unlock(&inode->i_mutex);
4385         if (!error)
4386                 fsnotify_create(dir, new_dentry);
4387         return error;
4388 }
4389 /*
4390  * Most codes are copied from sys_linkat.
4391  */
4392 int ocfs2_reflink_ioctl(struct inode *inode,
4393                         const char __user *oldname,
4394                         const char __user *newname,
4395                         bool preserve)
4396 {
4397         struct dentry *new_dentry;
4398         struct nameidata nd;
4399         struct path old_path;
4400         int error;
4401         char *to = NULL;
4402
4403         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4404                 return -EOPNOTSUPP;
4405
4406         error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4407         if (error) {
4408                 mlog_errno(error);
4409                 return error;
4410         }
4411
4412         error = ocfs2_user_path_parent(newname, &nd, &to);
4413         if (error) {
4414                 mlog_errno(error);
4415                 goto out;
4416         }
4417
4418         error = -EXDEV;
4419         if (old_path.mnt != nd.path.mnt)
4420                 goto out_release;
4421         new_dentry = lookup_create(&nd, 0);
4422         error = PTR_ERR(new_dentry);
4423         if (IS_ERR(new_dentry)) {
4424                 mlog_errno(error);
4425                 goto out_unlock;
4426         }
4427
4428         error = mnt_want_write(nd.path.mnt);
4429         if (error) {
4430                 mlog_errno(error);
4431                 goto out_dput;
4432         }
4433
4434         error = ocfs2_vfs_reflink(old_path.dentry,
4435                                   nd.path.dentry->d_inode,
4436                                   new_dentry, preserve);
4437         mnt_drop_write(nd.path.mnt);
4438 out_dput:
4439         dput(new_dentry);
4440 out_unlock:
4441         mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
4442 out_release:
4443         path_put(&nd.path);
4444         putname(to);
4445 out:
4446         path_put(&old_path);
4447
4448         return error;
4449 }