d15638529389624842ade0c2f610fb37309b4a42
[safe/jmp/linux-2.6] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/version.h>
23 #include "compat.h"
24 #include "hash.h"
25 #include "crc32c.h"
26 #include "ctree.h"
27 #include "disk-io.h"
28 #include "print-tree.h"
29 #include "transaction.h"
30 #include "volumes.h"
31 #include "locking.h"
32 #include "ref-cache.h"
33 #include "compat.h"
34
35 #define PENDING_EXTENT_INSERT 0
36 #define PENDING_EXTENT_DELETE 1
37 #define PENDING_BACKREF_UPDATE 2
38
39 struct pending_extent_op {
40         int type;
41         u64 bytenr;
42         u64 num_bytes;
43         u64 parent;
44         u64 orig_parent;
45         u64 generation;
46         u64 orig_generation;
47         int level;
48         struct list_head list;
49         int del;
50 };
51
52 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
53                                  btrfs_root *extent_root, int all);
54 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
55                                btrfs_root *extent_root, int all);
56 static struct btrfs_block_group_cache *
57 __btrfs_find_block_group(struct btrfs_root *root,
58                          struct btrfs_block_group_cache *hint,
59                          u64 search_start, int data, int owner);
60 static int pin_down_bytes(struct btrfs_trans_handle *trans,
61                           struct btrfs_root *root,
62                           u64 bytenr, u64 num_bytes, int is_data);
63 static int update_block_group(struct btrfs_trans_handle *trans,
64                               struct btrfs_root *root,
65                               u64 bytenr, u64 num_bytes, int alloc,
66                               int mark_free);
67
68 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
69 {
70         return (cache->flags & bits) == bits;
71 }
72
73 /*
74  * this adds the block group to the fs_info rb tree for the block group
75  * cache
76  */
77 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
78                                 struct btrfs_block_group_cache *block_group)
79 {
80         struct rb_node **p;
81         struct rb_node *parent = NULL;
82         struct btrfs_block_group_cache *cache;
83
84         spin_lock(&info->block_group_cache_lock);
85         p = &info->block_group_cache_tree.rb_node;
86
87         while (*p) {
88                 parent = *p;
89                 cache = rb_entry(parent, struct btrfs_block_group_cache,
90                                  cache_node);
91                 if (block_group->key.objectid < cache->key.objectid) {
92                         p = &(*p)->rb_left;
93                 } else if (block_group->key.objectid > cache->key.objectid) {
94                         p = &(*p)->rb_right;
95                 } else {
96                         spin_unlock(&info->block_group_cache_lock);
97                         return -EEXIST;
98                 }
99         }
100
101         rb_link_node(&block_group->cache_node, parent, p);
102         rb_insert_color(&block_group->cache_node,
103                         &info->block_group_cache_tree);
104         spin_unlock(&info->block_group_cache_lock);
105
106         return 0;
107 }
108
109 /*
110  * This will return the block group at or after bytenr if contains is 0, else
111  * it will return the block group that contains the bytenr
112  */
113 static struct btrfs_block_group_cache *
114 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
115                               int contains)
116 {
117         struct btrfs_block_group_cache *cache, *ret = NULL;
118         struct rb_node *n;
119         u64 end, start;
120
121         spin_lock(&info->block_group_cache_lock);
122         n = info->block_group_cache_tree.rb_node;
123
124         while (n) {
125                 cache = rb_entry(n, struct btrfs_block_group_cache,
126                                  cache_node);
127                 end = cache->key.objectid + cache->key.offset - 1;
128                 start = cache->key.objectid;
129
130                 if (bytenr < start) {
131                         if (!contains && (!ret || start < ret->key.objectid))
132                                 ret = cache;
133                         n = n->rb_left;
134                 } else if (bytenr > start) {
135                         if (contains && bytenr <= end) {
136                                 ret = cache;
137                                 break;
138                         }
139                         n = n->rb_right;
140                 } else {
141                         ret = cache;
142                         break;
143                 }
144         }
145         spin_unlock(&info->block_group_cache_lock);
146
147         return ret;
148 }
149
150 /*
151  * this is only called by cache_block_group, since we could have freed extents
152  * we need to check the pinned_extents for any extents that can't be used yet
153  * since their free space will be released as soon as the transaction commits.
154  */
155 static int add_new_free_space(struct btrfs_block_group_cache *block_group,
156                               struct btrfs_fs_info *info, u64 start, u64 end)
157 {
158         u64 extent_start, extent_end, size;
159         int ret;
160
161         mutex_lock(&info->pinned_mutex);
162         while (start < end) {
163                 ret = find_first_extent_bit(&info->pinned_extents, start,
164                                             &extent_start, &extent_end,
165                                             EXTENT_DIRTY);
166                 if (ret)
167                         break;
168
169                 if (extent_start == start) {
170                         start = extent_end + 1;
171                 } else if (extent_start > start && extent_start < end) {
172                         size = extent_start - start;
173                         ret = btrfs_add_free_space(block_group, start,
174                                                    size);
175                         BUG_ON(ret);
176                         start = extent_end + 1;
177                 } else {
178                         break;
179                 }
180         }
181
182         if (start < end) {
183                 size = end - start;
184                 ret = btrfs_add_free_space(block_group, start, size);
185                 BUG_ON(ret);
186         }
187         mutex_unlock(&info->pinned_mutex);
188
189         return 0;
190 }
191
192 static int cache_block_group(struct btrfs_root *root,
193                              struct btrfs_block_group_cache *block_group)
194 {
195         struct btrfs_path *path;
196         int ret = 0;
197         struct btrfs_key key;
198         struct extent_buffer *leaf;
199         int slot;
200         u64 last = 0;
201         u64 first_free;
202         int found = 0;
203
204         if (!block_group)
205                 return 0;
206
207         root = root->fs_info->extent_root;
208
209         if (block_group->cached)
210                 return 0;
211
212         path = btrfs_alloc_path();
213         if (!path)
214                 return -ENOMEM;
215
216         path->reada = 2;
217         /*
218          * we get into deadlocks with paths held by callers of this function.
219          * since the alloc_mutex is protecting things right now, just
220          * skip the locking here
221          */
222         path->skip_locking = 1;
223         first_free = max_t(u64, block_group->key.objectid,
224                            BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
225         key.objectid = block_group->key.objectid;
226         key.offset = 0;
227         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
228         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
229         if (ret < 0)
230                 goto err;
231         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
232         if (ret < 0)
233                 goto err;
234         if (ret == 0) {
235                 leaf = path->nodes[0];
236                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
237                 if (key.objectid + key.offset > first_free)
238                         first_free = key.objectid + key.offset;
239         }
240         while(1) {
241                 leaf = path->nodes[0];
242                 slot = path->slots[0];
243                 if (slot >= btrfs_header_nritems(leaf)) {
244                         ret = btrfs_next_leaf(root, path);
245                         if (ret < 0)
246                                 goto err;
247                         if (ret == 0)
248                                 continue;
249                         else
250                                 break;
251                 }
252                 btrfs_item_key_to_cpu(leaf, &key, slot);
253                 if (key.objectid < block_group->key.objectid)
254                         goto next;
255
256                 if (key.objectid >= block_group->key.objectid +
257                     block_group->key.offset)
258                         break;
259
260                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
261                         if (!found) {
262                                 last = first_free;
263                                 found = 1;
264                         }
265
266                         add_new_free_space(block_group, root->fs_info, last,
267                                            key.objectid);
268
269                         last = key.objectid + key.offset;
270                 }
271 next:
272                 path->slots[0]++;
273         }
274
275         if (!found)
276                 last = first_free;
277
278         add_new_free_space(block_group, root->fs_info, last,
279                            block_group->key.objectid +
280                            block_group->key.offset);
281
282         block_group->cached = 1;
283         ret = 0;
284 err:
285         btrfs_free_path(path);
286         return ret;
287 }
288
289 /*
290  * return the block group that starts at or after bytenr
291  */
292 static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
293                                                        btrfs_fs_info *info,
294                                                          u64 bytenr)
295 {
296         struct btrfs_block_group_cache *cache;
297
298         cache = block_group_cache_tree_search(info, bytenr, 0);
299
300         return cache;
301 }
302
303 /*
304  * return the block group that contains teh given bytenr
305  */
306 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
307                                                          btrfs_fs_info *info,
308                                                          u64 bytenr)
309 {
310         struct btrfs_block_group_cache *cache;
311
312         cache = block_group_cache_tree_search(info, bytenr, 1);
313
314         return cache;
315 }
316
317 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
318                                                   u64 flags)
319 {
320         struct list_head *head = &info->space_info;
321         struct list_head *cur;
322         struct btrfs_space_info *found;
323         list_for_each(cur, head) {
324                 found = list_entry(cur, struct btrfs_space_info, list);
325                 if (found->flags == flags)
326                         return found;
327         }
328         return NULL;
329 }
330
331 static u64 div_factor(u64 num, int factor)
332 {
333         if (factor == 10)
334                 return num;
335         num *= factor;
336         do_div(num, 10);
337         return num;
338 }
339
340 static struct btrfs_block_group_cache *
341 __btrfs_find_block_group(struct btrfs_root *root,
342                          struct btrfs_block_group_cache *hint,
343                          u64 search_start, int data, int owner)
344 {
345         struct btrfs_block_group_cache *cache;
346         struct btrfs_block_group_cache *found_group = NULL;
347         struct btrfs_fs_info *info = root->fs_info;
348         u64 used;
349         u64 last = 0;
350         u64 free_check;
351         int full_search = 0;
352         int factor = 10;
353         int wrapped = 0;
354
355         if (data & BTRFS_BLOCK_GROUP_METADATA)
356                 factor = 9;
357
358         if (search_start) {
359                 struct btrfs_block_group_cache *shint;
360                 shint = btrfs_lookup_first_block_group(info, search_start);
361                 if (shint && block_group_bits(shint, data)) {
362                         spin_lock(&shint->lock);
363                         used = btrfs_block_group_used(&shint->item);
364                         if (used + shint->pinned + shint->reserved <
365                             div_factor(shint->key.offset, factor)) {
366                                 spin_unlock(&shint->lock);
367                                 return shint;
368                         }
369                         spin_unlock(&shint->lock);
370                 }
371         }
372         if (hint && block_group_bits(hint, data)) {
373                 spin_lock(&hint->lock);
374                 used = btrfs_block_group_used(&hint->item);
375                 if (used + hint->pinned + hint->reserved <
376                     div_factor(hint->key.offset, factor)) {
377                         spin_unlock(&hint->lock);
378                         return hint;
379                 }
380                 spin_unlock(&hint->lock);
381                 last = hint->key.objectid + hint->key.offset;
382         } else {
383                 if (hint)
384                         last = max(hint->key.objectid, search_start);
385                 else
386                         last = search_start;
387         }
388 again:
389         while (1) {
390                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
391                 if (!cache)
392                         break;
393
394                 spin_lock(&cache->lock);
395                 last = cache->key.objectid + cache->key.offset;
396                 used = btrfs_block_group_used(&cache->item);
397
398                 if (block_group_bits(cache, data)) {
399                         free_check = div_factor(cache->key.offset, factor);
400                         if (used + cache->pinned + cache->reserved <
401                             free_check) {
402                                 found_group = cache;
403                                 spin_unlock(&cache->lock);
404                                 goto found;
405                         }
406                 }
407                 spin_unlock(&cache->lock);
408                 cond_resched();
409         }
410         if (!wrapped) {
411                 last = search_start;
412                 wrapped = 1;
413                 goto again;
414         }
415         if (!full_search && factor < 10) {
416                 last = search_start;
417                 full_search = 1;
418                 factor = 10;
419                 goto again;
420         }
421 found:
422         return found_group;
423 }
424
425 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
426                                                  struct btrfs_block_group_cache
427                                                  *hint, u64 search_start,
428                                                  int data, int owner)
429 {
430
431         struct btrfs_block_group_cache *ret;
432         ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
433         return ret;
434 }
435
436 /* simple helper to search for an existing extent at a given offset */
437 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
438 {
439         int ret;
440         struct btrfs_key key;
441         struct btrfs_path *path;
442
443         path = btrfs_alloc_path();
444         BUG_ON(!path);
445         key.objectid = start;
446         key.offset = len;
447         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
448         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
449                                 0, 0);
450         btrfs_free_path(path);
451         return ret;
452 }
453
454 /*
455  * Back reference rules.  Back refs have three main goals:
456  *
457  * 1) differentiate between all holders of references to an extent so that
458  *    when a reference is dropped we can make sure it was a valid reference
459  *    before freeing the extent.
460  *
461  * 2) Provide enough information to quickly find the holders of an extent
462  *    if we notice a given block is corrupted or bad.
463  *
464  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
465  *    maintenance.  This is actually the same as #2, but with a slightly
466  *    different use case.
467  *
468  * File extents can be referenced by:
469  *
470  * - multiple snapshots, subvolumes, or different generations in one subvol
471  * - different files inside a single subvolume
472  * - different offsets inside a file (bookend extents in file.c)
473  *
474  * The extent ref structure has fields for:
475  *
476  * - Objectid of the subvolume root
477  * - Generation number of the tree holding the reference
478  * - objectid of the file holding the reference
479  * - number of references holding by parent node (alway 1 for tree blocks)
480  *
481  * Btree leaf may hold multiple references to a file extent. In most cases,
482  * these references are from same file and the corresponding offsets inside
483  * the file are close together.
484  *
485  * When a file extent is allocated the fields are filled in:
486  *     (root_key.objectid, trans->transid, inode objectid, 1)
487  *
488  * When a leaf is cow'd new references are added for every file extent found
489  * in the leaf.  It looks similar to the create case, but trans->transid will
490  * be different when the block is cow'd.
491  *
492  *     (root_key.objectid, trans->transid, inode objectid,
493  *      number of references in the leaf)
494  *
495  * When a file extent is removed either during snapshot deletion or
496  * file truncation, we find the corresponding back reference and check
497  * the following fields:
498  *
499  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
500  *      inode objectid)
501  *
502  * Btree extents can be referenced by:
503  *
504  * - Different subvolumes
505  * - Different generations of the same subvolume
506  *
507  * When a tree block is created, back references are inserted:
508  *
509  * (root->root_key.objectid, trans->transid, level, 1)
510  *
511  * When a tree block is cow'd, new back references are added for all the
512  * blocks it points to. If the tree block isn't in reference counted root,
513  * the old back references are removed. These new back references are of
514  * the form (trans->transid will have increased since creation):
515  *
516  * (root->root_key.objectid, trans->transid, level, 1)
517  *
518  * When a backref is in deleting, the following fields are checked:
519  *
520  * if backref was for a tree root:
521  *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
522  * else
523  *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
524  *
525  * Back Reference Key composing:
526  *
527  * The key objectid corresponds to the first byte in the extent, the key
528  * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
529  * byte of parent extent. If a extent is tree root, the key offset is set
530  * to the key objectid.
531  */
532
533 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
534                                           struct btrfs_root *root,
535                                           struct btrfs_path *path,
536                                           u64 bytenr, u64 parent,
537                                           u64 ref_root, u64 ref_generation,
538                                           u64 owner_objectid, int del)
539 {
540         struct btrfs_key key;
541         struct btrfs_extent_ref *ref;
542         struct extent_buffer *leaf;
543         u64 ref_objectid;
544         int ret;
545
546         key.objectid = bytenr;
547         key.type = BTRFS_EXTENT_REF_KEY;
548         key.offset = parent;
549
550         ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
551         if (ret < 0)
552                 goto out;
553         if (ret > 0) {
554                 ret = -ENOENT;
555                 goto out;
556         }
557
558         leaf = path->nodes[0];
559         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
560         ref_objectid = btrfs_ref_objectid(leaf, ref);
561         if (btrfs_ref_root(leaf, ref) != ref_root ||
562             btrfs_ref_generation(leaf, ref) != ref_generation ||
563             (ref_objectid != owner_objectid &&
564              ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
565                 ret = -EIO;
566                 WARN_ON(1);
567                 goto out;
568         }
569         ret = 0;
570 out:
571         return ret;
572 }
573
574 /*
575  * updates all the backrefs that are pending on update_list for the
576  * extent_root
577  */
578 static int noinline update_backrefs(struct btrfs_trans_handle *trans,
579                                     struct btrfs_root *extent_root,
580                                     struct btrfs_path *path,
581                                     struct list_head *update_list)
582 {
583         struct btrfs_key key;
584         struct btrfs_extent_ref *ref;
585         struct btrfs_fs_info *info = extent_root->fs_info;
586         struct pending_extent_op *op;
587         struct extent_buffer *leaf;
588         int ret = 0;
589         struct list_head *cur = update_list->next;
590         u64 ref_objectid;
591         u64 ref_root = extent_root->root_key.objectid;
592
593         op = list_entry(cur, struct pending_extent_op, list);
594
595 search:
596         key.objectid = op->bytenr;
597         key.type = BTRFS_EXTENT_REF_KEY;
598         key.offset = op->orig_parent;
599
600         ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
601         BUG_ON(ret);
602
603         leaf = path->nodes[0];
604
605 loop:
606         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
607
608         ref_objectid = btrfs_ref_objectid(leaf, ref);
609
610         if (btrfs_ref_root(leaf, ref) != ref_root ||
611             btrfs_ref_generation(leaf, ref) != op->orig_generation ||
612             (ref_objectid != op->level &&
613              ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
614                 printk(KERN_ERR "couldn't find %Lu, parent %Lu, root %Lu, "
615                        "owner %u\n", op->bytenr, op->orig_parent,
616                        ref_root, op->level);
617                 btrfs_print_leaf(extent_root, leaf);
618                 BUG();
619         }
620
621         key.objectid = op->bytenr;
622         key.offset = op->parent;
623         key.type = BTRFS_EXTENT_REF_KEY;
624         ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
625         BUG_ON(ret);
626         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
627         btrfs_set_ref_generation(leaf, ref, op->generation);
628
629         cur = cur->next;
630
631         list_del_init(&op->list);
632         unlock_extent(&info->extent_ins, op->bytenr,
633                       op->bytenr + op->num_bytes - 1, GFP_NOFS);
634         kfree(op);
635
636         if (cur == update_list) {
637                 btrfs_mark_buffer_dirty(path->nodes[0]);
638                 btrfs_release_path(extent_root, path);
639                 goto out;
640         }
641
642         op = list_entry(cur, struct pending_extent_op, list);
643
644         path->slots[0]++;
645         while (path->slots[0] < btrfs_header_nritems(leaf)) {
646                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
647                 if (key.objectid == op->bytenr &&
648                     key.type == BTRFS_EXTENT_REF_KEY)
649                         goto loop;
650                 path->slots[0]++;
651         }
652
653         btrfs_mark_buffer_dirty(path->nodes[0]);
654         btrfs_release_path(extent_root, path);
655         goto search;
656
657 out:
658         return 0;
659 }
660
661 static int noinline insert_extents(struct btrfs_trans_handle *trans,
662                                    struct btrfs_root *extent_root,
663                                    struct btrfs_path *path,
664                                    struct list_head *insert_list, int nr)
665 {
666         struct btrfs_key *keys;
667         u32 *data_size;
668         struct pending_extent_op *op;
669         struct extent_buffer *leaf;
670         struct list_head *cur = insert_list->next;
671         struct btrfs_fs_info *info = extent_root->fs_info;
672         u64 ref_root = extent_root->root_key.objectid;
673         int i = 0, last = 0, ret;
674         int total = nr * 2;
675
676         if (!nr)
677                 return 0;
678
679         keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
680         if (!keys)
681                 return -ENOMEM;
682
683         data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
684         if (!data_size) {
685                 kfree(keys);
686                 return -ENOMEM;
687         }
688
689         list_for_each_entry(op, insert_list, list) {
690                 keys[i].objectid = op->bytenr;
691                 keys[i].offset = op->num_bytes;
692                 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
693                 data_size[i] = sizeof(struct btrfs_extent_item);
694                 i++;
695
696                 keys[i].objectid = op->bytenr;
697                 keys[i].offset = op->parent;
698                 keys[i].type = BTRFS_EXTENT_REF_KEY;
699                 data_size[i] = sizeof(struct btrfs_extent_ref);
700                 i++;
701         }
702
703         op = list_entry(cur, struct pending_extent_op, list);
704         i = 0;
705         while (i < total) {
706                 int c;
707                 ret = btrfs_insert_some_items(trans, extent_root, path,
708                                               keys+i, data_size+i, total-i);
709                 BUG_ON(ret < 0);
710
711                 if (last && ret > 1)
712                         BUG();
713
714                 leaf = path->nodes[0];
715                 for (c = 0; c < ret; c++) {
716                         int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
717
718                         /*
719                          * if the first item we inserted was a backref, then
720                          * the EXTENT_ITEM will be the odd c's, else it will
721                          * be the even c's
722                          */
723                         if ((ref_first && (c % 2)) ||
724                             (!ref_first && !(c % 2))) {
725                                 struct btrfs_extent_item *itm;
726
727                                 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
728                                                      struct btrfs_extent_item);
729                                 btrfs_set_extent_refs(path->nodes[0], itm, 1);
730                                 op->del++;
731                         } else {
732                                 struct btrfs_extent_ref *ref;
733
734                                 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
735                                                      struct btrfs_extent_ref);
736                                 btrfs_set_ref_root(leaf, ref, ref_root);
737                                 btrfs_set_ref_generation(leaf, ref,
738                                                          op->generation);
739                                 btrfs_set_ref_objectid(leaf, ref, op->level);
740                                 btrfs_set_ref_num_refs(leaf, ref, 1);
741                                 op->del++;
742                         }
743
744                         /*
745                          * using del to see when its ok to free up the
746                          * pending_extent_op.  In the case where we insert the
747                          * last item on the list in order to help do batching
748                          * we need to not free the extent op until we actually
749                          * insert the extent_item
750                          */
751                         if (op->del == 2) {
752                                 unlock_extent(&info->extent_ins, op->bytenr,
753                                               op->bytenr + op->num_bytes - 1,
754                                               GFP_NOFS);
755                                 cur = cur->next;
756                                 list_del_init(&op->list);
757                                 kfree(op);
758                                 if (cur != insert_list)
759                                         op = list_entry(cur,
760                                                 struct pending_extent_op,
761                                                 list);
762                         }
763                 }
764                 btrfs_mark_buffer_dirty(leaf);
765                 btrfs_release_path(extent_root, path);
766
767                 /*
768                  * Ok backref's and items usually go right next to eachother,
769                  * but if we could only insert 1 item that means that we
770                  * inserted on the end of a leaf, and we have no idea what may
771                  * be on the next leaf so we just play it safe.  In order to
772                  * try and help this case we insert the last thing on our
773                  * insert list so hopefully it will end up being the last
774                  * thing on the leaf and everything else will be before it,
775                  * which will let us insert a whole bunch of items at the same
776                  * time.
777                  */
778                 if (ret == 1 && !last && (i + ret < total)) {
779                         /*
780                          * last: where we will pick up the next time around
781                          * i: our current key to insert, will be total - 1
782                          * cur: the current op we are screwing with
783                          * op: duh
784                          */
785                         last = i + ret;
786                         i = total - 1;
787                         cur = insert_list->prev;
788                         op = list_entry(cur, struct pending_extent_op, list);
789                 } else if (last) {
790                         /*
791                          * ok we successfully inserted the last item on the
792                          * list, lets reset everything
793                          *
794                          * i: our current key to insert, so where we left off
795                          *    last time
796                          * last: done with this
797                          * cur: the op we are messing with
798                          * op: duh
799                          * total: since we inserted the last key, we need to
800                          *        decrement total so we dont overflow
801                          */
802                         i = last;
803                         last = 0;
804                         total--;
805                         if (i < total) {
806                                 cur = insert_list->next;
807                                 op = list_entry(cur, struct pending_extent_op,
808                                                 list);
809                         }
810                 } else {
811                         i += ret;
812                 }
813
814                 cond_resched();
815         }
816         ret = 0;
817         kfree(keys);
818         kfree(data_size);
819         return ret;
820 }
821
822 static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
823                                           struct btrfs_root *root,
824                                           struct btrfs_path *path,
825                                           u64 bytenr, u64 parent,
826                                           u64 ref_root, u64 ref_generation,
827                                           u64 owner_objectid)
828 {
829         struct btrfs_key key;
830         struct extent_buffer *leaf;
831         struct btrfs_extent_ref *ref;
832         u32 num_refs;
833         int ret;
834
835         key.objectid = bytenr;
836         key.type = BTRFS_EXTENT_REF_KEY;
837         key.offset = parent;
838
839         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
840         if (ret == 0) {
841                 leaf = path->nodes[0];
842                 ref = btrfs_item_ptr(leaf, path->slots[0],
843                                      struct btrfs_extent_ref);
844                 btrfs_set_ref_root(leaf, ref, ref_root);
845                 btrfs_set_ref_generation(leaf, ref, ref_generation);
846                 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
847                 btrfs_set_ref_num_refs(leaf, ref, 1);
848         } else if (ret == -EEXIST) {
849                 u64 existing_owner;
850                 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
851                 leaf = path->nodes[0];
852                 ref = btrfs_item_ptr(leaf, path->slots[0],
853                                      struct btrfs_extent_ref);
854                 if (btrfs_ref_root(leaf, ref) != ref_root ||
855                     btrfs_ref_generation(leaf, ref) != ref_generation) {
856                         ret = -EIO;
857                         WARN_ON(1);
858                         goto out;
859                 }
860
861                 num_refs = btrfs_ref_num_refs(leaf, ref);
862                 BUG_ON(num_refs == 0);
863                 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
864
865                 existing_owner = btrfs_ref_objectid(leaf, ref);
866                 if (existing_owner != owner_objectid &&
867                     existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
868                         btrfs_set_ref_objectid(leaf, ref,
869                                         BTRFS_MULTIPLE_OBJECTIDS);
870                 }
871                 ret = 0;
872         } else {
873                 goto out;
874         }
875         btrfs_mark_buffer_dirty(path->nodes[0]);
876 out:
877         btrfs_release_path(root, path);
878         return ret;
879 }
880
881 static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
882                                           struct btrfs_root *root,
883                                           struct btrfs_path *path)
884 {
885         struct extent_buffer *leaf;
886         struct btrfs_extent_ref *ref;
887         u32 num_refs;
888         int ret = 0;
889
890         leaf = path->nodes[0];
891         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
892         num_refs = btrfs_ref_num_refs(leaf, ref);
893         BUG_ON(num_refs == 0);
894         num_refs -= 1;
895         if (num_refs == 0) {
896                 ret = btrfs_del_item(trans, root, path);
897         } else {
898                 btrfs_set_ref_num_refs(leaf, ref, num_refs);
899                 btrfs_mark_buffer_dirty(leaf);
900         }
901         btrfs_release_path(root, path);
902         return ret;
903 }
904
905 #ifdef BIO_RW_DISCARD
906 static void btrfs_issue_discard(struct block_device *bdev,
907                                 u64 start, u64 len)
908 {
909 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
910         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
911 #else
912         blkdev_issue_discard(bdev, start >> 9, len >> 9);
913 #endif
914 }
915 #endif
916
917 static int noinline free_extents(struct btrfs_trans_handle *trans,
918                                  struct btrfs_root *extent_root,
919                                  struct list_head *del_list)
920 {
921         struct btrfs_fs_info *info = extent_root->fs_info;
922         struct btrfs_path *path;
923         struct btrfs_key key, found_key;
924         struct extent_buffer *leaf;
925         struct list_head *cur;
926         struct pending_extent_op *op;
927         struct btrfs_extent_item *ei;
928         int ret, num_to_del, extent_slot = 0, found_extent = 0;
929         u32 refs;
930         u64 bytes_freed = 0;
931
932         path = btrfs_alloc_path();
933         if (!path)
934                 return -ENOMEM;
935         path->reada = 1;
936
937 search:
938         /* search for the backref for the current ref we want to delete */
939         cur = del_list->next;
940         op = list_entry(cur, struct pending_extent_op, list);
941         ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
942                                     op->orig_parent,
943                                     extent_root->root_key.objectid,
944                                     op->orig_generation, op->level, 1);
945         if (ret) {
946                 printk("Unable to find backref byte nr %Lu root %Lu gen %Lu "
947                        "owner %u\n", op->bytenr,
948                        extent_root->root_key.objectid, op->orig_generation,
949                        op->level);
950                 btrfs_print_leaf(extent_root, path->nodes[0]);
951                 WARN_ON(1);
952                 goto out;
953         }
954
955         extent_slot = path->slots[0];
956         num_to_del = 1;
957         found_extent = 0;
958
959         /*
960          * if we aren't the first item on the leaf we can move back one and see
961          * if our ref is right next to our extent item
962          */
963         if (likely(extent_slot)) {
964                 extent_slot--;
965                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
966                                       extent_slot);
967                 if (found_key.objectid == op->bytenr &&
968                     found_key.type == BTRFS_EXTENT_ITEM_KEY &&
969                     found_key.offset == op->num_bytes) {
970                         num_to_del++;
971                         found_extent = 1;
972                 }
973         }
974
975         /*
976          * if we didn't find the extent we need to delete the backref and then
977          * search for the extent item key so we can update its ref count
978          */
979         if (!found_extent) {
980                 key.objectid = op->bytenr;
981                 key.type = BTRFS_EXTENT_ITEM_KEY;
982                 key.offset = op->num_bytes;
983
984                 ret = remove_extent_backref(trans, extent_root, path);
985                 BUG_ON(ret);
986                 btrfs_release_path(extent_root, path);
987                 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
988                 BUG_ON(ret);
989                 extent_slot = path->slots[0];
990         }
991
992         /* this is where we update the ref count for the extent */
993         leaf = path->nodes[0];
994         ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
995         refs = btrfs_extent_refs(leaf, ei);
996         BUG_ON(refs == 0);
997         refs--;
998         btrfs_set_extent_refs(leaf, ei, refs);
999
1000         btrfs_mark_buffer_dirty(leaf);
1001
1002         /*
1003          * This extent needs deleting.  The reason cur_slot is extent_slot +
1004          * num_to_del is because extent_slot points to the slot where the extent
1005          * is, and if the backref was not right next to the extent we will be
1006          * deleting at least 1 item, and will want to start searching at the
1007          * slot directly next to extent_slot.  However if we did find the
1008          * backref next to the extent item them we will be deleting at least 2
1009          * items and will want to start searching directly after the ref slot
1010          */
1011         if (!refs) {
1012                 struct list_head *pos, *n, *end;
1013                 int cur_slot = extent_slot+num_to_del;
1014                 u64 super_used;
1015                 u64 root_used;
1016
1017                 path->slots[0] = extent_slot;
1018                 bytes_freed = op->num_bytes;
1019
1020                 mutex_lock(&info->pinned_mutex);
1021                 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1022                                      op->num_bytes, op->level >=
1023                                      BTRFS_FIRST_FREE_OBJECTID);
1024                 mutex_unlock(&info->pinned_mutex);
1025                 BUG_ON(ret < 0);
1026                 op->del = ret;
1027
1028                 /*
1029                  * we need to see if we can delete multiple things at once, so
1030                  * start looping through the list of extents we are wanting to
1031                  * delete and see if their extent/backref's are right next to
1032                  * eachother and the extents only have 1 ref
1033                  */
1034                 for (pos = cur->next; pos != del_list; pos = pos->next) {
1035                         struct pending_extent_op *tmp;
1036
1037                         tmp = list_entry(pos, struct pending_extent_op, list);
1038
1039                         /* we only want to delete extent+ref at this stage */
1040                         if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1041                                 break;
1042
1043                         btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1044                         if (found_key.objectid != tmp->bytenr ||
1045                             found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1046                             found_key.offset != tmp->num_bytes)
1047                                 break;
1048
1049                         /* check to make sure this extent only has one ref */
1050                         ei = btrfs_item_ptr(leaf, cur_slot,
1051                                             struct btrfs_extent_item);
1052                         if (btrfs_extent_refs(leaf, ei) != 1)
1053                                 break;
1054
1055                         btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1056                         if (found_key.objectid != tmp->bytenr ||
1057                             found_key.type != BTRFS_EXTENT_REF_KEY ||
1058                             found_key.offset != tmp->orig_parent)
1059                                 break;
1060
1061                         /*
1062                          * the ref is right next to the extent, we can set the
1063                          * ref count to 0 since we will delete them both now
1064                          */
1065                         btrfs_set_extent_refs(leaf, ei, 0);
1066
1067                         /* pin down the bytes for this extent */
1068                         mutex_lock(&info->pinned_mutex);
1069                         ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1070                                              tmp->num_bytes, tmp->level >=
1071                                              BTRFS_FIRST_FREE_OBJECTID);
1072                         mutex_unlock(&info->pinned_mutex);
1073                         BUG_ON(ret < 0);
1074
1075                         /*
1076                          * use the del field to tell if we need to go ahead and
1077                          * free up the extent when we delete the item or not.
1078                          */
1079                         tmp->del = ret;
1080                         bytes_freed += tmp->num_bytes;
1081
1082                         num_to_del += 2;
1083                         cur_slot += 2;
1084                 }
1085                 end = pos;
1086
1087                 /* update the free space counters */
1088                 spin_lock_irq(&info->delalloc_lock);
1089                 super_used = btrfs_super_bytes_used(&info->super_copy);
1090                 btrfs_set_super_bytes_used(&info->super_copy,
1091                                            super_used - bytes_freed);
1092                 spin_unlock_irq(&info->delalloc_lock);
1093
1094                 root_used = btrfs_root_used(&extent_root->root_item);
1095                 btrfs_set_root_used(&extent_root->root_item,
1096                                     root_used - bytes_freed);
1097
1098                 /* delete the items */
1099                 ret = btrfs_del_items(trans, extent_root, path,
1100                                       path->slots[0], num_to_del);
1101                 BUG_ON(ret);
1102
1103                 /*
1104                  * loop through the extents we deleted and do the cleanup work
1105                  * on them
1106                  */
1107                 for (pos = cur, n = pos->next; pos != end;
1108                      pos = n, n = pos->next) {
1109                         struct pending_extent_op *tmp;
1110 #ifdef BIO_RW_DISCARD
1111                         u64 map_length;
1112                         struct btrfs_multi_bio *multi = NULL;
1113 #endif
1114                         tmp = list_entry(pos, struct pending_extent_op, list);
1115
1116                         /*
1117                          * remember tmp->del tells us wether or not we pinned
1118                          * down the extent
1119                          */
1120                         ret = update_block_group(trans, extent_root,
1121                                                  tmp->bytenr, tmp->num_bytes, 0,
1122                                                  tmp->del);
1123                         BUG_ON(ret);
1124
1125 #ifdef BIO_RW_DISCARD
1126                         map_length = tmp->num_bytes;
1127                         ret = btrfs_map_block(&info->mapping_tree, READ,
1128                                               tmp->bytenr, &map_length, &multi,
1129                                               0);
1130                         if (!ret) {
1131                                 struct btrfs_bio_stripe *stripe;
1132                                 int i;
1133
1134                                 stripe = multi->stripes;
1135
1136                                 if (map_length > tmp->num_bytes)
1137                                         map_length = tmp->num_bytes;
1138
1139                                 for (i = 0; i < multi->num_stripes;
1140                                      i++, stripe++)
1141                                         btrfs_issue_discard(stripe->dev->bdev,
1142                                                             stripe->physical,
1143                                                             map_length);
1144                                 kfree(multi);
1145                         }
1146 #endif
1147                         list_del_init(&tmp->list);
1148                         unlock_extent(&info->extent_ins, tmp->bytenr,
1149                                       tmp->bytenr + tmp->num_bytes - 1,
1150                                       GFP_NOFS);
1151                         kfree(tmp);
1152                 }
1153         } else if (refs && found_extent) {
1154                 /*
1155                  * the ref and extent were right next to eachother, but the
1156                  * extent still has a ref, so just free the backref and keep
1157                  * going
1158                  */
1159                 ret = remove_extent_backref(trans, extent_root, path);
1160                 BUG_ON(ret);
1161
1162                 list_del_init(&op->list);
1163                 unlock_extent(&info->extent_ins, op->bytenr,
1164                               op->bytenr + op->num_bytes - 1, GFP_NOFS);
1165                 kfree(op);
1166         } else {
1167                 /*
1168                  * the extent has multiple refs and the backref we were looking
1169                  * for was not right next to it, so just unlock and go next,
1170                  * we're good to go
1171                  */
1172                 list_del_init(&op->list);
1173                 unlock_extent(&info->extent_ins, op->bytenr,
1174                               op->bytenr + op->num_bytes - 1, GFP_NOFS);
1175                 kfree(op);
1176         }
1177
1178         btrfs_release_path(extent_root, path);
1179         if (!list_empty(del_list))
1180                 goto search;
1181
1182 out:
1183         btrfs_free_path(path);
1184         return ret;
1185 }
1186
1187 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1188                                      struct btrfs_root *root, u64 bytenr,
1189                                      u64 orig_parent, u64 parent,
1190                                      u64 orig_root, u64 ref_root,
1191                                      u64 orig_generation, u64 ref_generation,
1192                                      u64 owner_objectid)
1193 {
1194         int ret;
1195         struct btrfs_root *extent_root = root->fs_info->extent_root;
1196         struct btrfs_path *path;
1197
1198         if (root == root->fs_info->extent_root) {
1199                 struct pending_extent_op *extent_op;
1200                 u64 num_bytes;
1201
1202                 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1203                 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1204                 mutex_lock(&root->fs_info->extent_ins_mutex);
1205                 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1206                                 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1207                         u64 priv;
1208                         ret = get_state_private(&root->fs_info->extent_ins,
1209                                                 bytenr, &priv);
1210                         BUG_ON(ret);
1211                         extent_op = (struct pending_extent_op *)
1212                                                         (unsigned long)priv;
1213                         BUG_ON(extent_op->parent != orig_parent);
1214                         BUG_ON(extent_op->generation != orig_generation);
1215
1216                         extent_op->parent = parent;
1217                         extent_op->generation = ref_generation;
1218                 } else {
1219                         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1220                         BUG_ON(!extent_op);
1221
1222                         extent_op->type = PENDING_BACKREF_UPDATE;
1223                         extent_op->bytenr = bytenr;
1224                         extent_op->num_bytes = num_bytes;
1225                         extent_op->parent = parent;
1226                         extent_op->orig_parent = orig_parent;
1227                         extent_op->generation = ref_generation;
1228                         extent_op->orig_generation = orig_generation;
1229                         extent_op->level = (int)owner_objectid;
1230                         INIT_LIST_HEAD(&extent_op->list);
1231                         extent_op->del = 0;
1232
1233                         set_extent_bits(&root->fs_info->extent_ins,
1234                                         bytenr, bytenr + num_bytes - 1,
1235                                         EXTENT_WRITEBACK, GFP_NOFS);
1236                         set_state_private(&root->fs_info->extent_ins,
1237                                           bytenr, (unsigned long)extent_op);
1238                 }
1239                 mutex_unlock(&root->fs_info->extent_ins_mutex);
1240                 return 0;
1241         }
1242
1243         path = btrfs_alloc_path();
1244         if (!path)
1245                 return -ENOMEM;
1246         ret = lookup_extent_backref(trans, extent_root, path,
1247                                     bytenr, orig_parent, orig_root,
1248                                     orig_generation, owner_objectid, 1);
1249         if (ret)
1250                 goto out;
1251         ret = remove_extent_backref(trans, extent_root, path);
1252         if (ret)
1253                 goto out;
1254         ret = insert_extent_backref(trans, extent_root, path, bytenr,
1255                                     parent, ref_root, ref_generation,
1256                                     owner_objectid);
1257         BUG_ON(ret);
1258         finish_current_insert(trans, extent_root, 0);
1259         del_pending_extents(trans, extent_root, 0);
1260 out:
1261         btrfs_free_path(path);
1262         return ret;
1263 }
1264
1265 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1266                             struct btrfs_root *root, u64 bytenr,
1267                             u64 orig_parent, u64 parent,
1268                             u64 ref_root, u64 ref_generation,
1269                             u64 owner_objectid)
1270 {
1271         int ret;
1272         if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1273             owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1274                 return 0;
1275         ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1276                                         parent, ref_root, ref_root,
1277                                         ref_generation, ref_generation,
1278                                         owner_objectid);
1279         return ret;
1280 }
1281
1282 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1283                                   struct btrfs_root *root, u64 bytenr,
1284                                   u64 orig_parent, u64 parent,
1285                                   u64 orig_root, u64 ref_root,
1286                                   u64 orig_generation, u64 ref_generation,
1287                                   u64 owner_objectid)
1288 {
1289         struct btrfs_path *path;
1290         int ret;
1291         struct btrfs_key key;
1292         struct extent_buffer *l;
1293         struct btrfs_extent_item *item;
1294         u32 refs;
1295
1296         path = btrfs_alloc_path();
1297         if (!path)
1298                 return -ENOMEM;
1299
1300         path->reada = 1;
1301         key.objectid = bytenr;
1302         key.type = BTRFS_EXTENT_ITEM_KEY;
1303         key.offset = (u64)-1;
1304
1305         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1306                                 0, 1);
1307         if (ret < 0)
1308                 return ret;
1309         BUG_ON(ret == 0 || path->slots[0] == 0);
1310
1311         path->slots[0]--;
1312         l = path->nodes[0];
1313
1314         btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1315         if (key.objectid != bytenr) {
1316                 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1317                 printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
1318                 BUG();
1319         }
1320         BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1321
1322         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1323         refs = btrfs_extent_refs(l, item);
1324         btrfs_set_extent_refs(l, item, refs + 1);
1325         btrfs_mark_buffer_dirty(path->nodes[0]);
1326
1327         btrfs_release_path(root->fs_info->extent_root, path);
1328
1329         path->reada = 1;
1330         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1331                                     path, bytenr, parent,
1332                                     ref_root, ref_generation,
1333                                     owner_objectid);
1334         BUG_ON(ret);
1335         finish_current_insert(trans, root->fs_info->extent_root, 0);
1336         del_pending_extents(trans, root->fs_info->extent_root, 0);
1337
1338         btrfs_free_path(path);
1339         return 0;
1340 }
1341
1342 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1343                          struct btrfs_root *root,
1344                          u64 bytenr, u64 num_bytes, u64 parent,
1345                          u64 ref_root, u64 ref_generation,
1346                          u64 owner_objectid)
1347 {
1348         int ret;
1349         if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1350             owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1351                 return 0;
1352         ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1353                                      0, ref_root, 0, ref_generation,
1354                                      owner_objectid);
1355         return ret;
1356 }
1357
1358 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1359                          struct btrfs_root *root)
1360 {
1361         finish_current_insert(trans, root->fs_info->extent_root, 1);
1362         del_pending_extents(trans, root->fs_info->extent_root, 1);
1363         return 0;
1364 }
1365
1366 int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1367                             struct btrfs_root *root, u64 bytenr,
1368                             u64 num_bytes, u32 *refs)
1369 {
1370         struct btrfs_path *path;
1371         int ret;
1372         struct btrfs_key key;
1373         struct extent_buffer *l;
1374         struct btrfs_extent_item *item;
1375
1376         WARN_ON(num_bytes < root->sectorsize);
1377         path = btrfs_alloc_path();
1378         path->reada = 1;
1379         key.objectid = bytenr;
1380         key.offset = num_bytes;
1381         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1382         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1383                                 0, 0);
1384         if (ret < 0)
1385                 goto out;
1386         if (ret != 0) {
1387                 btrfs_print_leaf(root, path->nodes[0]);
1388                 printk("failed to find block number %Lu\n", bytenr);
1389                 BUG();
1390         }
1391         l = path->nodes[0];
1392         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1393         *refs = btrfs_extent_refs(l, item);
1394 out:
1395         btrfs_free_path(path);
1396         return 0;
1397 }
1398
1399 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1400                           struct btrfs_root *root, u64 bytenr)
1401 {
1402         struct btrfs_root *extent_root = root->fs_info->extent_root;
1403         struct btrfs_path *path;
1404         struct extent_buffer *leaf;
1405         struct btrfs_extent_ref *ref_item;
1406         struct btrfs_key key;
1407         struct btrfs_key found_key;
1408         u64 ref_root;
1409         u64 last_snapshot;
1410         u32 nritems;
1411         int ret;
1412
1413         key.objectid = bytenr;
1414         key.offset = (u64)-1;
1415         key.type = BTRFS_EXTENT_ITEM_KEY;
1416
1417         path = btrfs_alloc_path();
1418         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1419         if (ret < 0)
1420                 goto out;
1421         BUG_ON(ret == 0);
1422
1423         ret = -ENOENT;
1424         if (path->slots[0] == 0)
1425                 goto out;
1426
1427         path->slots[0]--;
1428         leaf = path->nodes[0];
1429         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1430
1431         if (found_key.objectid != bytenr ||
1432             found_key.type != BTRFS_EXTENT_ITEM_KEY)
1433                 goto out;
1434
1435         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1436         while (1) {
1437                 leaf = path->nodes[0];
1438                 nritems = btrfs_header_nritems(leaf);
1439                 if (path->slots[0] >= nritems) {
1440                         ret = btrfs_next_leaf(extent_root, path);
1441                         if (ret < 0)
1442                                 goto out;
1443                         if (ret == 0)
1444                                 continue;
1445                         break;
1446                 }
1447                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1448                 if (found_key.objectid != bytenr)
1449                         break;
1450
1451                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1452                         path->slots[0]++;
1453                         continue;
1454                 }
1455
1456                 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1457                                           struct btrfs_extent_ref);
1458                 ref_root = btrfs_ref_root(leaf, ref_item);
1459                 if (ref_root != root->root_key.objectid &&
1460                     ref_root != BTRFS_TREE_LOG_OBJECTID) {
1461                         ret = 1;
1462                         goto out;
1463                 }
1464                 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1465                         ret = 1;
1466                         goto out;
1467                 }
1468
1469                 path->slots[0]++;
1470         }
1471         ret = 0;
1472 out:
1473         btrfs_free_path(path);
1474         return ret;
1475 }
1476
1477 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1478                     struct extent_buffer *buf, u32 nr_extents)
1479 {
1480         struct btrfs_key key;
1481         struct btrfs_file_extent_item *fi;
1482         u64 root_gen;
1483         u32 nritems;
1484         int i;
1485         int level;
1486         int ret = 0;
1487         int shared = 0;
1488
1489         if (!root->ref_cows)
1490                 return 0;
1491
1492         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1493                 shared = 0;
1494                 root_gen = root->root_key.offset;
1495         } else {
1496                 shared = 1;
1497                 root_gen = trans->transid - 1;
1498         }
1499
1500         level = btrfs_header_level(buf);
1501         nritems = btrfs_header_nritems(buf);
1502
1503         if (level == 0) {
1504                 struct btrfs_leaf_ref *ref;
1505                 struct btrfs_extent_info *info;
1506
1507                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1508                 if (!ref) {
1509                         ret = -ENOMEM;
1510                         goto out;
1511                 }
1512
1513                 ref->root_gen = root_gen;
1514                 ref->bytenr = buf->start;
1515                 ref->owner = btrfs_header_owner(buf);
1516                 ref->generation = btrfs_header_generation(buf);
1517                 ref->nritems = nr_extents;
1518                 info = ref->extents;
1519
1520                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1521                         u64 disk_bytenr;
1522                         btrfs_item_key_to_cpu(buf, &key, i);
1523                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1524                                 continue;
1525                         fi = btrfs_item_ptr(buf, i,
1526                                             struct btrfs_file_extent_item);
1527                         if (btrfs_file_extent_type(buf, fi) ==
1528                             BTRFS_FILE_EXTENT_INLINE)
1529                                 continue;
1530                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1531                         if (disk_bytenr == 0)
1532                                 continue;
1533
1534                         info->bytenr = disk_bytenr;
1535                         info->num_bytes =
1536                                 btrfs_file_extent_disk_num_bytes(buf, fi);
1537                         info->objectid = key.objectid;
1538                         info->offset = key.offset;
1539                         info++;
1540                 }
1541
1542                 ret = btrfs_add_leaf_ref(root, ref, shared);
1543                 if (ret == -EEXIST && shared) {
1544                         struct btrfs_leaf_ref *old;
1545                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1546                         BUG_ON(!old);
1547                         btrfs_remove_leaf_ref(root, old);
1548                         btrfs_free_leaf_ref(root, old);
1549                         ret = btrfs_add_leaf_ref(root, ref, shared);
1550                 }
1551                 WARN_ON(ret);
1552                 btrfs_free_leaf_ref(root, ref);
1553         }
1554 out:
1555         return ret;
1556 }
1557
1558 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1559                   struct extent_buffer *orig_buf, struct extent_buffer *buf,
1560                   u32 *nr_extents)
1561 {
1562         u64 bytenr;
1563         u64 ref_root;
1564         u64 orig_root;
1565         u64 ref_generation;
1566         u64 orig_generation;
1567         u32 nritems;
1568         u32 nr_file_extents = 0;
1569         struct btrfs_key key;
1570         struct btrfs_file_extent_item *fi;
1571         int i;
1572         int level;
1573         int ret = 0;
1574         int faili = 0;
1575         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1576                             u64, u64, u64, u64, u64, u64, u64, u64);
1577
1578         ref_root = btrfs_header_owner(buf);
1579         ref_generation = btrfs_header_generation(buf);
1580         orig_root = btrfs_header_owner(orig_buf);
1581         orig_generation = btrfs_header_generation(orig_buf);
1582
1583         nritems = btrfs_header_nritems(buf);
1584         level = btrfs_header_level(buf);
1585
1586         if (root->ref_cows) {
1587                 process_func = __btrfs_inc_extent_ref;
1588         } else {
1589                 if (level == 0 &&
1590                     root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1591                         goto out;
1592                 if (level != 0 &&
1593                     root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1594                         goto out;
1595                 process_func = __btrfs_update_extent_ref;
1596         }
1597
1598         for (i = 0; i < nritems; i++) {
1599                 cond_resched();
1600                 if (level == 0) {
1601                         btrfs_item_key_to_cpu(buf, &key, i);
1602                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1603                                 continue;
1604                         fi = btrfs_item_ptr(buf, i,
1605                                             struct btrfs_file_extent_item);
1606                         if (btrfs_file_extent_type(buf, fi) ==
1607                             BTRFS_FILE_EXTENT_INLINE)
1608                                 continue;
1609                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1610                         if (bytenr == 0)
1611                                 continue;
1612
1613                         nr_file_extents++;
1614
1615                         ret = process_func(trans, root, bytenr,
1616                                            orig_buf->start, buf->start,
1617                                            orig_root, ref_root,
1618                                            orig_generation, ref_generation,
1619                                            key.objectid);
1620
1621                         if (ret) {
1622                                 faili = i;
1623                                 WARN_ON(1);
1624                                 goto fail;
1625                         }
1626                 } else {
1627                         bytenr = btrfs_node_blockptr(buf, i);
1628                         ret = process_func(trans, root, bytenr,
1629                                            orig_buf->start, buf->start,
1630                                            orig_root, ref_root,
1631                                            orig_generation, ref_generation,
1632                                            level - 1);
1633                         if (ret) {
1634                                 faili = i;
1635                                 WARN_ON(1);
1636                                 goto fail;
1637                         }
1638                 }
1639         }
1640 out:
1641         if (nr_extents) {
1642                 if (level == 0)
1643                         *nr_extents = nr_file_extents;
1644                 else
1645                         *nr_extents = nritems;
1646         }
1647         return 0;
1648 fail:
1649         WARN_ON(1);
1650         return ret;
1651 }
1652
1653 int btrfs_update_ref(struct btrfs_trans_handle *trans,
1654                      struct btrfs_root *root, struct extent_buffer *orig_buf,
1655                      struct extent_buffer *buf, int start_slot, int nr)
1656
1657 {
1658         u64 bytenr;
1659         u64 ref_root;
1660         u64 orig_root;
1661         u64 ref_generation;
1662         u64 orig_generation;
1663         struct btrfs_key key;
1664         struct btrfs_file_extent_item *fi;
1665         int i;
1666         int ret;
1667         int slot;
1668         int level;
1669
1670         BUG_ON(start_slot < 0);
1671         BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1672
1673         ref_root = btrfs_header_owner(buf);
1674         ref_generation = btrfs_header_generation(buf);
1675         orig_root = btrfs_header_owner(orig_buf);
1676         orig_generation = btrfs_header_generation(orig_buf);
1677         level = btrfs_header_level(buf);
1678
1679         if (!root->ref_cows) {
1680                 if (level == 0 &&
1681                     root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1682                         return 0;
1683                 if (level != 0 &&
1684                     root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1685                         return 0;
1686         }
1687
1688         for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1689                 cond_resched();
1690                 if (level == 0) {
1691                         btrfs_item_key_to_cpu(buf, &key, slot);
1692                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1693                                 continue;
1694                         fi = btrfs_item_ptr(buf, slot,
1695                                             struct btrfs_file_extent_item);
1696                         if (btrfs_file_extent_type(buf, fi) ==
1697                             BTRFS_FILE_EXTENT_INLINE)
1698                                 continue;
1699                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1700                         if (bytenr == 0)
1701                                 continue;
1702                         ret = __btrfs_update_extent_ref(trans, root, bytenr,
1703                                             orig_buf->start, buf->start,
1704                                             orig_root, ref_root,
1705                                             orig_generation, ref_generation,
1706                                             key.objectid);
1707                         if (ret)
1708                                 goto fail;
1709                 } else {
1710                         bytenr = btrfs_node_blockptr(buf, slot);
1711                         ret = __btrfs_update_extent_ref(trans, root, bytenr,
1712                                             orig_buf->start, buf->start,
1713                                             orig_root, ref_root,
1714                                             orig_generation, ref_generation,
1715                                             level - 1);
1716                         if (ret)
1717                                 goto fail;
1718                 }
1719         }
1720         return 0;
1721 fail:
1722         WARN_ON(1);
1723         return -1;
1724 }
1725
1726 static int write_one_cache_group(struct btrfs_trans_handle *trans,
1727                                  struct btrfs_root *root,
1728                                  struct btrfs_path *path,
1729                                  struct btrfs_block_group_cache *cache)
1730 {
1731         int ret;
1732         int pending_ret;
1733         struct btrfs_root *extent_root = root->fs_info->extent_root;
1734         unsigned long bi;
1735         struct extent_buffer *leaf;
1736
1737         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1738         if (ret < 0)
1739                 goto fail;
1740         BUG_ON(ret);
1741
1742         leaf = path->nodes[0];
1743         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1744         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1745         btrfs_mark_buffer_dirty(leaf);
1746         btrfs_release_path(extent_root, path);
1747 fail:
1748         finish_current_insert(trans, extent_root, 0);
1749         pending_ret = del_pending_extents(trans, extent_root, 0);
1750         if (ret)
1751                 return ret;
1752         if (pending_ret)
1753                 return pending_ret;
1754         return 0;
1755
1756 }
1757
1758 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1759                                    struct btrfs_root *root)
1760 {
1761         struct btrfs_block_group_cache *cache, *entry;
1762         struct rb_node *n;
1763         int err = 0;
1764         int werr = 0;
1765         struct btrfs_path *path;
1766         u64 last = 0;
1767
1768         path = btrfs_alloc_path();
1769         if (!path)
1770                 return -ENOMEM;
1771
1772         while(1) {
1773                 cache = NULL;
1774                 spin_lock(&root->fs_info->block_group_cache_lock);
1775                 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1776                      n; n = rb_next(n)) {
1777                         entry = rb_entry(n, struct btrfs_block_group_cache,
1778                                          cache_node);
1779                         if (entry->dirty) {
1780                                 cache = entry;
1781                                 break;
1782                         }
1783                 }
1784                 spin_unlock(&root->fs_info->block_group_cache_lock);
1785
1786                 if (!cache)
1787                         break;
1788
1789                 cache->dirty = 0;
1790                 last += cache->key.offset;
1791
1792                 err = write_one_cache_group(trans, root,
1793                                             path, cache);
1794                 /*
1795                  * if we fail to write the cache group, we want
1796                  * to keep it marked dirty in hopes that a later
1797                  * write will work
1798                  */
1799                 if (err) {
1800                         werr = err;
1801                         continue;
1802                 }
1803         }
1804         btrfs_free_path(path);
1805         return werr;
1806 }
1807
1808 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1809                              u64 total_bytes, u64 bytes_used,
1810                              struct btrfs_space_info **space_info)
1811 {
1812         struct btrfs_space_info *found;
1813
1814         found = __find_space_info(info, flags);
1815         if (found) {
1816                 spin_lock(&found->lock);
1817                 found->total_bytes += total_bytes;
1818                 found->bytes_used += bytes_used;
1819                 found->full = 0;
1820                 spin_unlock(&found->lock);
1821                 *space_info = found;
1822                 return 0;
1823         }
1824         found = kzalloc(sizeof(*found), GFP_NOFS);
1825         if (!found)
1826                 return -ENOMEM;
1827
1828         list_add(&found->list, &info->space_info);
1829         INIT_LIST_HEAD(&found->block_groups);
1830         init_rwsem(&found->groups_sem);
1831         spin_lock_init(&found->lock);
1832         found->flags = flags;
1833         found->total_bytes = total_bytes;
1834         found->bytes_used = bytes_used;
1835         found->bytes_pinned = 0;
1836         found->bytes_reserved = 0;
1837         found->bytes_readonly = 0;
1838         found->full = 0;
1839         found->force_alloc = 0;
1840         *space_info = found;
1841         return 0;
1842 }
1843
1844 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1845 {
1846         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1847                                    BTRFS_BLOCK_GROUP_RAID1 |
1848                                    BTRFS_BLOCK_GROUP_RAID10 |
1849                                    BTRFS_BLOCK_GROUP_DUP);
1850         if (extra_flags) {
1851                 if (flags & BTRFS_BLOCK_GROUP_DATA)
1852                         fs_info->avail_data_alloc_bits |= extra_flags;
1853                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1854                         fs_info->avail_metadata_alloc_bits |= extra_flags;
1855                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1856                         fs_info->avail_system_alloc_bits |= extra_flags;
1857         }
1858 }
1859
1860 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1861 {
1862         spin_lock(&cache->space_info->lock);
1863         spin_lock(&cache->lock);
1864         if (!cache->ro) {
1865                 cache->space_info->bytes_readonly += cache->key.offset -
1866                                         btrfs_block_group_used(&cache->item);
1867                 cache->ro = 1;
1868         }
1869         spin_unlock(&cache->lock);
1870         spin_unlock(&cache->space_info->lock);
1871 }
1872
1873 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1874 {
1875         u64 num_devices = root->fs_info->fs_devices->rw_devices;
1876
1877         if (num_devices == 1)
1878                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1879         if (num_devices < 4)
1880                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1881
1882         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1883             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1884                       BTRFS_BLOCK_GROUP_RAID10))) {
1885                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1886         }
1887
1888         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1889             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1890                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1891         }
1892
1893         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1894             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1895              (flags & BTRFS_BLOCK_GROUP_RAID10) |
1896              (flags & BTRFS_BLOCK_GROUP_DUP)))
1897                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1898         return flags;
1899 }
1900
1901 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1902                           struct btrfs_root *extent_root, u64 alloc_bytes,
1903                           u64 flags, int force)
1904 {
1905         struct btrfs_space_info *space_info;
1906         u64 thresh;
1907         int ret = 0;
1908
1909         mutex_lock(&extent_root->fs_info->chunk_mutex);
1910
1911         flags = btrfs_reduce_alloc_profile(extent_root, flags);
1912
1913         space_info = __find_space_info(extent_root->fs_info, flags);
1914         if (!space_info) {
1915                 ret = update_space_info(extent_root->fs_info, flags,
1916                                         0, 0, &space_info);
1917                 BUG_ON(ret);
1918         }
1919         BUG_ON(!space_info);
1920
1921         spin_lock(&space_info->lock);
1922         if (space_info->force_alloc) {
1923                 force = 1;
1924                 space_info->force_alloc = 0;
1925         }
1926         if (space_info->full) {
1927                 spin_unlock(&space_info->lock);
1928                 goto out;
1929         }
1930
1931         thresh = space_info->total_bytes - space_info->bytes_readonly;
1932         thresh = div_factor(thresh, 6);
1933         if (!force &&
1934            (space_info->bytes_used + space_info->bytes_pinned +
1935             space_info->bytes_reserved + alloc_bytes) < thresh) {
1936                 spin_unlock(&space_info->lock);
1937                 goto out;
1938         }
1939         spin_unlock(&space_info->lock);
1940
1941         ret = btrfs_alloc_chunk(trans, extent_root, flags);
1942         if (ret) {
1943 printk("space info full %Lu\n", flags);
1944                 space_info->full = 1;
1945         }
1946 out:
1947         mutex_unlock(&extent_root->fs_info->chunk_mutex);
1948         return ret;
1949 }
1950
1951 static int update_block_group(struct btrfs_trans_handle *trans,
1952                               struct btrfs_root *root,
1953                               u64 bytenr, u64 num_bytes, int alloc,
1954                               int mark_free)
1955 {
1956         struct btrfs_block_group_cache *cache;
1957         struct btrfs_fs_info *info = root->fs_info;
1958         u64 total = num_bytes;
1959         u64 old_val;
1960         u64 byte_in_group;
1961
1962         while(total) {
1963                 cache = btrfs_lookup_block_group(info, bytenr);
1964                 if (!cache)
1965                         return -1;
1966                 byte_in_group = bytenr - cache->key.objectid;
1967                 WARN_ON(byte_in_group > cache->key.offset);
1968
1969                 spin_lock(&cache->space_info->lock);
1970                 spin_lock(&cache->lock);
1971                 cache->dirty = 1;
1972                 old_val = btrfs_block_group_used(&cache->item);
1973                 num_bytes = min(total, cache->key.offset - byte_in_group);
1974                 if (alloc) {
1975                         old_val += num_bytes;
1976                         cache->space_info->bytes_used += num_bytes;
1977                         if (cache->ro) {
1978                                 cache->space_info->bytes_readonly -= num_bytes;
1979                                 WARN_ON(1);
1980                         }
1981                         btrfs_set_block_group_used(&cache->item, old_val);
1982                         spin_unlock(&cache->lock);
1983                         spin_unlock(&cache->space_info->lock);
1984                 } else {
1985                         old_val -= num_bytes;
1986                         cache->space_info->bytes_used -= num_bytes;
1987                         if (cache->ro)
1988                                 cache->space_info->bytes_readonly += num_bytes;
1989                         btrfs_set_block_group_used(&cache->item, old_val);
1990                         spin_unlock(&cache->lock);
1991                         spin_unlock(&cache->space_info->lock);
1992                         if (mark_free) {
1993                                 int ret;
1994                                 ret = btrfs_add_free_space(cache, bytenr,
1995                                                            num_bytes);
1996                                 if (ret)
1997                                         return -1;
1998                         }
1999                 }
2000                 total -= num_bytes;
2001                 bytenr += num_bytes;
2002         }
2003         return 0;
2004 }
2005
2006 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2007 {
2008         struct btrfs_block_group_cache *cache;
2009
2010         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
2011         if (!cache)
2012                 return 0;
2013
2014         return cache->key.objectid;
2015 }
2016
2017 int btrfs_update_pinned_extents(struct btrfs_root *root,
2018                                 u64 bytenr, u64 num, int pin)
2019 {
2020         u64 len;
2021         struct btrfs_block_group_cache *cache;
2022         struct btrfs_fs_info *fs_info = root->fs_info;
2023
2024         WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2025         if (pin) {
2026                 set_extent_dirty(&fs_info->pinned_extents,
2027                                 bytenr, bytenr + num - 1, GFP_NOFS);
2028         } else {
2029                 clear_extent_dirty(&fs_info->pinned_extents,
2030                                 bytenr, bytenr + num - 1, GFP_NOFS);
2031         }
2032         while (num > 0) {
2033                 cache = btrfs_lookup_block_group(fs_info, bytenr);
2034                 BUG_ON(!cache);
2035                 len = min(num, cache->key.offset -
2036                           (bytenr - cache->key.objectid));
2037                 if (pin) {
2038                         spin_lock(&cache->space_info->lock);
2039                         spin_lock(&cache->lock);
2040                         cache->pinned += len;
2041                         cache->space_info->bytes_pinned += len;
2042                         spin_unlock(&cache->lock);
2043                         spin_unlock(&cache->space_info->lock);
2044                         fs_info->total_pinned += len;
2045                 } else {
2046                         spin_lock(&cache->space_info->lock);
2047                         spin_lock(&cache->lock);
2048                         cache->pinned -= len;
2049                         cache->space_info->bytes_pinned -= len;
2050                         spin_unlock(&cache->lock);
2051                         spin_unlock(&cache->space_info->lock);
2052                         fs_info->total_pinned -= len;
2053                         if (cache->cached)
2054                                 btrfs_add_free_space(cache, bytenr, len);
2055                 }
2056                 bytenr += len;
2057                 num -= len;
2058         }
2059         return 0;
2060 }
2061
2062 static int update_reserved_extents(struct btrfs_root *root,
2063                                    u64 bytenr, u64 num, int reserve)
2064 {
2065         u64 len;
2066         struct btrfs_block_group_cache *cache;
2067         struct btrfs_fs_info *fs_info = root->fs_info;
2068
2069         while (num > 0) {
2070                 cache = btrfs_lookup_block_group(fs_info, bytenr);
2071                 BUG_ON(!cache);
2072                 len = min(num, cache->key.offset -
2073                           (bytenr - cache->key.objectid));
2074
2075                 spin_lock(&cache->space_info->lock);
2076                 spin_lock(&cache->lock);
2077                 if (reserve) {
2078                         cache->reserved += len;
2079                         cache->space_info->bytes_reserved += len;
2080                 } else {
2081                         cache->reserved -= len;
2082                         cache->space_info->bytes_reserved -= len;
2083                 }
2084                 spin_unlock(&cache->lock);
2085                 spin_unlock(&cache->space_info->lock);
2086                 bytenr += len;
2087                 num -= len;
2088         }
2089         return 0;
2090 }
2091
2092 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2093 {
2094         u64 last = 0;
2095         u64 start;
2096         u64 end;
2097         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2098         int ret;
2099
2100         mutex_lock(&root->fs_info->pinned_mutex);
2101         while(1) {
2102                 ret = find_first_extent_bit(pinned_extents, last,
2103                                             &start, &end, EXTENT_DIRTY);
2104                 if (ret)
2105                         break;
2106                 set_extent_dirty(copy, start, end, GFP_NOFS);
2107                 last = end + 1;
2108         }
2109         mutex_unlock(&root->fs_info->pinned_mutex);
2110         return 0;
2111 }
2112
2113 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2114                                struct btrfs_root *root,
2115                                struct extent_io_tree *unpin)
2116 {
2117         u64 start;
2118         u64 end;
2119         int ret;
2120
2121         mutex_lock(&root->fs_info->pinned_mutex);
2122         while(1) {
2123                 ret = find_first_extent_bit(unpin, 0, &start, &end,
2124                                             EXTENT_DIRTY);
2125                 if (ret)
2126                         break;
2127                 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2128                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2129                 if (need_resched()) {
2130                         mutex_unlock(&root->fs_info->pinned_mutex);
2131                         cond_resched();
2132                         mutex_lock(&root->fs_info->pinned_mutex);
2133                 }
2134         }
2135         mutex_unlock(&root->fs_info->pinned_mutex);
2136         return 0;
2137 }
2138
2139 static int finish_current_insert(struct btrfs_trans_handle *trans,
2140                                  struct btrfs_root *extent_root, int all)
2141 {
2142         u64 start;
2143         u64 end;
2144         u64 priv;
2145         u64 search = 0;
2146         u64 skipped = 0;
2147         struct btrfs_fs_info *info = extent_root->fs_info;
2148         struct btrfs_path *path;
2149         struct pending_extent_op *extent_op, *tmp;
2150         struct list_head insert_list, update_list;
2151         int ret;
2152         int num_inserts = 0, max_inserts;
2153
2154         path = btrfs_alloc_path();
2155         INIT_LIST_HEAD(&insert_list);
2156         INIT_LIST_HEAD(&update_list);
2157
2158         max_inserts = extent_root->leafsize /
2159                 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2160                  sizeof(struct btrfs_extent_ref) +
2161                  sizeof(struct btrfs_extent_item));
2162 again:
2163         mutex_lock(&info->extent_ins_mutex);
2164         while (1) {
2165                 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2166                                             &end, EXTENT_WRITEBACK);
2167                 if (ret) {
2168                         if (skipped && all && !num_inserts) {
2169                                 skipped = 0;
2170                                 search = 0;
2171                                 continue;
2172                         }
2173                         mutex_unlock(&info->extent_ins_mutex);
2174                         break;
2175                 }
2176
2177                 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2178                 if (!ret) {
2179                         skipped = 1;
2180                         search = end + 1;
2181                         if (need_resched()) {
2182                                 mutex_unlock(&info->extent_ins_mutex);
2183                                 cond_resched();
2184                                 mutex_lock(&info->extent_ins_mutex);
2185                         }
2186                         continue;
2187                 }
2188
2189                 ret = get_state_private(&info->extent_ins, start, &priv);
2190                 BUG_ON(ret);
2191                 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2192
2193                 if (extent_op->type == PENDING_EXTENT_INSERT) {
2194                         num_inserts++;
2195                         list_add_tail(&extent_op->list, &insert_list);
2196                         search = end + 1;
2197                         if (num_inserts == max_inserts) {
2198                                 mutex_unlock(&info->extent_ins_mutex);
2199                                 break;
2200                         }
2201                 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2202                         list_add_tail(&extent_op->list, &update_list);
2203                         search = end + 1;
2204                 } else {
2205                         BUG();
2206                 }
2207         }
2208
2209         /*
2210          * process the update list, clear the writeback bit for it, and if
2211          * somebody marked this thing for deletion then just unlock it and be
2212          * done, the free_extents will handle it
2213          */
2214         mutex_lock(&info->extent_ins_mutex);
2215         list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2216                 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2217                                   extent_op->bytenr + extent_op->num_bytes - 1,
2218                                   EXTENT_WRITEBACK, GFP_NOFS);
2219                 if (extent_op->del) {
2220                         list_del_init(&extent_op->list);
2221                         unlock_extent(&info->extent_ins, extent_op->bytenr,
2222                                       extent_op->bytenr + extent_op->num_bytes
2223                                       - 1, GFP_NOFS);
2224                         kfree(extent_op);
2225                 }
2226         }
2227         mutex_unlock(&info->extent_ins_mutex);
2228
2229         /*
2230          * still have things left on the update list, go ahead an update
2231          * everything
2232          */
2233         if (!list_empty(&update_list)) {
2234                 ret = update_backrefs(trans, extent_root, path, &update_list);
2235                 BUG_ON(ret);
2236         }
2237
2238         /*
2239          * if no inserts need to be done, but we skipped some extents and we
2240          * need to make sure everything is cleaned then reset everything and
2241          * go back to the beginning
2242          */
2243         if (!num_inserts && all && skipped) {
2244                 search = 0;
2245                 skipped = 0;
2246                 INIT_LIST_HEAD(&update_list);
2247                 INIT_LIST_HEAD(&insert_list);
2248                 goto again;
2249         } else if (!num_inserts) {
2250                 goto out;
2251         }
2252
2253         /*
2254          * process the insert extents list.  Again if we are deleting this
2255          * extent, then just unlock it, pin down the bytes if need be, and be
2256          * done with it.  Saves us from having to actually insert the extent
2257          * into the tree and then subsequently come along and delete it
2258          */
2259         mutex_lock(&info->extent_ins_mutex);
2260         list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2261                 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2262                                   extent_op->bytenr + extent_op->num_bytes - 1,
2263                                   EXTENT_WRITEBACK, GFP_NOFS);
2264                 if (extent_op->del) {
2265                         list_del_init(&extent_op->list);
2266                         unlock_extent(&info->extent_ins, extent_op->bytenr,
2267                                       extent_op->bytenr + extent_op->num_bytes
2268                                       - 1, GFP_NOFS);
2269
2270                         mutex_lock(&extent_root->fs_info->pinned_mutex);
2271                         ret = pin_down_bytes(trans, extent_root,
2272                                              extent_op->bytenr,
2273                                              extent_op->num_bytes, 0);
2274                         mutex_unlock(&extent_root->fs_info->pinned_mutex);
2275
2276                         ret = update_block_group(trans, extent_root,
2277                                                  extent_op->bytenr,
2278                                                  extent_op->num_bytes,
2279                                                  0, ret > 0);
2280                         BUG_ON(ret);
2281                         kfree(extent_op);
2282                         num_inserts--;
2283                 }
2284         }
2285         mutex_unlock(&info->extent_ins_mutex);
2286
2287         ret = insert_extents(trans, extent_root, path, &insert_list,
2288                              num_inserts);
2289         BUG_ON(ret);
2290
2291         /*
2292          * if we broke out of the loop in order to insert stuff because we hit
2293          * the maximum number of inserts at a time we can handle, then loop
2294          * back and pick up where we left off
2295          */
2296         if (num_inserts == max_inserts) {
2297                 INIT_LIST_HEAD(&insert_list);
2298                 INIT_LIST_HEAD(&update_list);
2299                 num_inserts = 0;
2300                 goto again;
2301         }
2302
2303         /*
2304          * again, if we need to make absolutely sure there are no more pending
2305          * extent operations left and we know that we skipped some, go back to
2306          * the beginning and do it all again
2307          */
2308         if (all && skipped) {
2309                 INIT_LIST_HEAD(&insert_list);
2310                 INIT_LIST_HEAD(&update_list);
2311                 search = 0;
2312                 skipped = 0;
2313                 num_inserts = 0;
2314                 goto again;
2315         }
2316 out:
2317         btrfs_free_path(path);
2318         return 0;
2319 }
2320
2321 static int pin_down_bytes(struct btrfs_trans_handle *trans,
2322                           struct btrfs_root *root,
2323                           u64 bytenr, u64 num_bytes, int is_data)
2324 {
2325         int err = 0;
2326         struct extent_buffer *buf;
2327
2328         if (is_data)
2329                 goto pinit;
2330
2331         buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2332         if (!buf)
2333                 goto pinit;
2334
2335         /* we can reuse a block if it hasn't been written
2336          * and it is from this transaction.  We can't
2337          * reuse anything from the tree log root because
2338          * it has tiny sub-transactions.
2339          */
2340         if (btrfs_buffer_uptodate(buf, 0) &&
2341             btrfs_try_tree_lock(buf)) {
2342                 u64 header_owner = btrfs_header_owner(buf);
2343                 u64 header_transid = btrfs_header_generation(buf);
2344                 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2345                     header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2346                     header_transid == trans->transid &&
2347                     !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2348                         clean_tree_block(NULL, root, buf);
2349                         btrfs_tree_unlock(buf);
2350                         free_extent_buffer(buf);
2351                         return 1;
2352                 }
2353                 btrfs_tree_unlock(buf);
2354         }
2355         free_extent_buffer(buf);
2356 pinit:
2357         btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2358
2359         BUG_ON(err < 0);
2360         return 0;
2361 }
2362
2363 /*
2364  * remove an extent from the root, returns 0 on success
2365  */
2366 static int __free_extent(struct btrfs_trans_handle *trans,
2367                          struct btrfs_root *root,
2368                          u64 bytenr, u64 num_bytes, u64 parent,
2369                          u64 root_objectid, u64 ref_generation,
2370                          u64 owner_objectid, int pin, int mark_free)
2371 {
2372         struct btrfs_path *path;
2373         struct btrfs_key key;
2374         struct btrfs_fs_info *info = root->fs_info;
2375         struct btrfs_root *extent_root = info->extent_root;
2376         struct extent_buffer *leaf;
2377         int ret;
2378         int extent_slot = 0;
2379         int found_extent = 0;
2380         int num_to_del = 1;
2381         struct btrfs_extent_item *ei;
2382         u32 refs;
2383
2384         key.objectid = bytenr;
2385         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2386         key.offset = num_bytes;
2387         path = btrfs_alloc_path();
2388         if (!path)
2389                 return -ENOMEM;
2390
2391         path->reada = 1;
2392         ret = lookup_extent_backref(trans, extent_root, path,
2393                                     bytenr, parent, root_objectid,
2394                                     ref_generation, owner_objectid, 1);
2395         if (ret == 0) {
2396                 struct btrfs_key found_key;
2397                 extent_slot = path->slots[0];
2398                 while(extent_slot > 0) {
2399                         extent_slot--;
2400                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2401                                               extent_slot);
2402                         if (found_key.objectid != bytenr)
2403                                 break;
2404                         if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2405                             found_key.offset == num_bytes) {
2406                                 found_extent = 1;
2407                                 break;
2408                         }
2409                         if (path->slots[0] - extent_slot > 5)
2410                                 break;
2411                 }
2412                 if (!found_extent) {
2413                         ret = remove_extent_backref(trans, extent_root, path);
2414                         BUG_ON(ret);
2415                         btrfs_release_path(extent_root, path);
2416                         ret = btrfs_search_slot(trans, extent_root,
2417                                                 &key, path, -1, 1);
2418                         if (ret) {
2419                                 printk(KERN_ERR "umm, got %d back from search"
2420                                        ", was looking for %Lu\n", ret,
2421                                        bytenr);
2422                                 btrfs_print_leaf(extent_root, path->nodes[0]);
2423                         }
2424                         BUG_ON(ret);
2425                         extent_slot = path->slots[0];
2426                 }
2427         } else {
2428                 btrfs_print_leaf(extent_root, path->nodes[0]);
2429                 WARN_ON(1);
2430                 printk("Unable to find ref byte nr %Lu root %Lu "
2431                        "gen %Lu owner %Lu\n", bytenr,
2432                        root_objectid, ref_generation, owner_objectid);
2433         }
2434
2435         leaf = path->nodes[0];
2436         ei = btrfs_item_ptr(leaf, extent_slot,
2437                             struct btrfs_extent_item);
2438         refs = btrfs_extent_refs(leaf, ei);
2439         BUG_ON(refs == 0);
2440         refs -= 1;
2441         btrfs_set_extent_refs(leaf, ei, refs);
2442
2443         btrfs_mark_buffer_dirty(leaf);
2444
2445         if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2446                 struct btrfs_extent_ref *ref;
2447                 ref = btrfs_item_ptr(leaf, path->slots[0],
2448                                      struct btrfs_extent_ref);
2449                 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2450                 /* if the back ref and the extent are next to each other
2451                  * they get deleted below in one shot
2452                  */
2453                 path->slots[0] = extent_slot;
2454                 num_to_del = 2;
2455         } else if (found_extent) {
2456                 /* otherwise delete the extent back ref */
2457                 ret = remove_extent_backref(trans, extent_root, path);
2458                 BUG_ON(ret);
2459                 /* if refs are 0, we need to setup the path for deletion */
2460                 if (refs == 0) {
2461                         btrfs_release_path(extent_root, path);
2462                         ret = btrfs_search_slot(trans, extent_root, &key, path,
2463                                                 -1, 1);
2464                         BUG_ON(ret);
2465                 }
2466         }
2467
2468         if (refs == 0) {
2469                 u64 super_used;
2470                 u64 root_used;
2471 #ifdef BIO_RW_DISCARD
2472                 u64 map_length = num_bytes;
2473                 struct btrfs_multi_bio *multi = NULL;
2474 #endif
2475
2476                 if (pin) {
2477                         mutex_lock(&root->fs_info->pinned_mutex);
2478                         ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2479                                 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2480                         mutex_unlock(&root->fs_info->pinned_mutex);
2481                         if (ret > 0)
2482                                 mark_free = 1;
2483                         BUG_ON(ret < 0);
2484                 }
2485
2486                 /* block accounting for super block */
2487                 spin_lock_irq(&info->delalloc_lock);
2488                 super_used = btrfs_super_bytes_used(&info->super_copy);
2489                 btrfs_set_super_bytes_used(&info->super_copy,
2490                                            super_used - num_bytes);
2491                 spin_unlock_irq(&info->delalloc_lock);
2492
2493                 /* block accounting for root item */
2494                 root_used = btrfs_root_used(&root->root_item);
2495                 btrfs_set_root_used(&root->root_item,
2496                                            root_used - num_bytes);
2497                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2498                                       num_to_del);
2499                 BUG_ON(ret);
2500                 btrfs_release_path(extent_root, path);
2501                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2502                                          mark_free);
2503                 BUG_ON(ret);
2504
2505 #ifdef BIO_RW_DISCARD
2506                 /* Tell the block device(s) that the sectors can be discarded */
2507                 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2508                                       bytenr, &map_length, &multi, 0);
2509                 if (!ret) {
2510                         struct btrfs_bio_stripe *stripe = multi->stripes;
2511                         int i;
2512
2513                         if (map_length > num_bytes)
2514                                 map_length = num_bytes;
2515
2516                         for (i = 0; i < multi->num_stripes; i++, stripe++) {
2517                                 btrfs_issue_discard(stripe->dev->bdev,
2518                                                     stripe->physical,
2519                                                      map_length);
2520                         }
2521                         kfree(multi);
2522                 }
2523 #endif
2524         }
2525         btrfs_free_path(path);
2526         finish_current_insert(trans, extent_root, 0);
2527         return ret;
2528 }
2529
2530 /*
2531  * find all the blocks marked as pending in the radix tree and remove
2532  * them from the extent map
2533  */
2534 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
2535                                btrfs_root *extent_root, int all)
2536 {
2537         int ret;
2538         int err = 0;
2539         u64 start;
2540         u64 end;
2541         u64 priv;
2542         u64 search = 0;
2543         int nr = 0, skipped = 0;
2544         struct extent_io_tree *pending_del;
2545         struct extent_io_tree *extent_ins;
2546         struct pending_extent_op *extent_op;
2547         struct btrfs_fs_info *info = extent_root->fs_info;
2548         struct list_head delete_list;
2549
2550         INIT_LIST_HEAD(&delete_list);
2551         extent_ins = &extent_root->fs_info->extent_ins;
2552         pending_del = &extent_root->fs_info->pending_del;
2553
2554 again:
2555         mutex_lock(&info->extent_ins_mutex);
2556         while(1) {
2557                 ret = find_first_extent_bit(pending_del, search, &start, &end,
2558                                             EXTENT_WRITEBACK);
2559                 if (ret) {
2560                         if (all && skipped && !nr) {
2561                                 search = 0;
2562                                 continue;
2563                         }
2564                         mutex_unlock(&info->extent_ins_mutex);
2565                         break;
2566                 }
2567
2568                 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2569                 if (!ret) {
2570                         search = end+1;
2571                         skipped = 1;
2572
2573                         if (need_resched()) {
2574                                 mutex_unlock(&info->extent_ins_mutex);
2575                                 cond_resched();
2576                                 mutex_lock(&info->extent_ins_mutex);
2577                         }
2578
2579                         continue;
2580                 }
2581                 BUG_ON(ret < 0);
2582
2583                 ret = get_state_private(pending_del, start, &priv);
2584                 BUG_ON(ret);
2585                 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2586
2587                 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2588                                   GFP_NOFS);
2589                 if (!test_range_bit(extent_ins, start, end,
2590                                     EXTENT_WRITEBACK, 0)) {
2591                         list_add_tail(&extent_op->list, &delete_list);
2592                         nr++;
2593                 } else {
2594                         kfree(extent_op);
2595
2596                         ret = get_state_private(&info->extent_ins, start,
2597                                                 &priv);
2598                         BUG_ON(ret);
2599                         extent_op = (struct pending_extent_op *)
2600                                                 (unsigned long)priv;
2601
2602                         clear_extent_bits(&info->extent_ins, start, end,
2603                                           EXTENT_WRITEBACK, GFP_NOFS);
2604
2605                         if (extent_op->type == PENDING_BACKREF_UPDATE) {
2606                                 list_add_tail(&extent_op->list, &delete_list);
2607                                 search = end + 1;
2608                                 nr++;
2609                                 continue;
2610                         }
2611
2612                         mutex_lock(&extent_root->fs_info->pinned_mutex);
2613                         ret = pin_down_bytes(trans, extent_root, start,
2614                                              end + 1 - start, 0);
2615                         mutex_unlock(&extent_root->fs_info->pinned_mutex);
2616
2617                         ret = update_block_group(trans, extent_root, start,
2618                                                 end + 1 - start, 0, ret > 0);
2619
2620                         unlock_extent(extent_ins, start, end, GFP_NOFS);
2621                         BUG_ON(ret);
2622                         kfree(extent_op);
2623                 }
2624                 if (ret)
2625                         err = ret;
2626
2627                 search = end + 1;
2628
2629                 if (need_resched()) {
2630                         mutex_unlock(&info->extent_ins_mutex);
2631                         cond_resched();
2632                         mutex_lock(&info->extent_ins_mutex);
2633                 }
2634         }
2635
2636         if (nr) {
2637                 ret = free_extents(trans, extent_root, &delete_list);
2638                 BUG_ON(ret);
2639         }
2640
2641         if (all && skipped) {
2642                 INIT_LIST_HEAD(&delete_list);
2643                 search = 0;
2644                 nr = 0;
2645                 goto again;
2646         }
2647
2648         return err;
2649 }
2650
2651 /*
2652  * remove an extent from the root, returns 0 on success
2653  */
2654 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2655                                struct btrfs_root *root,
2656                                u64 bytenr, u64 num_bytes, u64 parent,
2657                                u64 root_objectid, u64 ref_generation,
2658                                u64 owner_objectid, int pin)
2659 {
2660         struct btrfs_root *extent_root = root->fs_info->extent_root;
2661         int pending_ret;
2662         int ret;
2663
2664         WARN_ON(num_bytes < root->sectorsize);
2665         if (root == extent_root) {
2666                 struct pending_extent_op *extent_op = NULL;
2667
2668                 mutex_lock(&root->fs_info->extent_ins_mutex);
2669                 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2670                                 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2671                         u64 priv;
2672                         ret = get_state_private(&root->fs_info->extent_ins,
2673                                                 bytenr, &priv);
2674                         BUG_ON(ret);
2675                         extent_op = (struct pending_extent_op *)
2676                                                 (unsigned long)priv;
2677
2678                         extent_op->del = 1;
2679                         if (extent_op->type == PENDING_EXTENT_INSERT) {
2680                                 mutex_unlock(&root->fs_info->extent_ins_mutex);
2681                                 return 0;
2682                         }
2683                 }
2684
2685                 if (extent_op) {
2686                         ref_generation = extent_op->orig_generation;
2687                         parent = extent_op->orig_parent;
2688                 }
2689
2690                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2691                 BUG_ON(!extent_op);
2692
2693                 extent_op->type = PENDING_EXTENT_DELETE;
2694                 extent_op->bytenr = bytenr;
2695                 extent_op->num_bytes = num_bytes;
2696                 extent_op->parent = parent;
2697                 extent_op->orig_parent = parent;
2698                 extent_op->generation = ref_generation;
2699                 extent_op->orig_generation = ref_generation;
2700                 extent_op->level = (int)owner_objectid;
2701                 INIT_LIST_HEAD(&extent_op->list);
2702                 extent_op->del = 0;
2703
2704                 set_extent_bits(&root->fs_info->pending_del,
2705                                 bytenr, bytenr + num_bytes - 1,
2706                                 EXTENT_WRITEBACK, GFP_NOFS);
2707                 set_state_private(&root->fs_info->pending_del,
2708                                   bytenr, (unsigned long)extent_op);
2709                 mutex_unlock(&root->fs_info->extent_ins_mutex);
2710                 return 0;
2711         }
2712         /* if metadata always pin */
2713         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2714                 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2715                         struct btrfs_block_group_cache *cache;
2716
2717                         /* btrfs_free_reserved_extent */
2718                         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2719                         BUG_ON(!cache);
2720                         btrfs_add_free_space(cache, bytenr, num_bytes);
2721                         update_reserved_extents(root, bytenr, num_bytes, 0);
2722                         return 0;
2723                 }
2724                 pin = 1;
2725         }
2726
2727         /* if data pin when any transaction has committed this */
2728         if (ref_generation != trans->transid)
2729                 pin = 1;
2730
2731         ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2732                             root_objectid, ref_generation,
2733                             owner_objectid, pin, pin == 0);
2734
2735         finish_current_insert(trans, root->fs_info->extent_root, 0);
2736         pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
2737         return ret ? ret : pending_ret;
2738 }
2739
2740 int btrfs_free_extent(struct btrfs_trans_handle *trans,
2741                       struct btrfs_root *root,
2742                       u64 bytenr, u64 num_bytes, u64 parent,
2743                       u64 root_objectid, u64 ref_generation,
2744                       u64 owner_objectid, int pin)
2745 {
2746         int ret;
2747
2748         ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2749                                   root_objectid, ref_generation,
2750                                   owner_objectid, pin);
2751         return ret;
2752 }
2753
2754 static u64 stripe_align(struct btrfs_root *root, u64 val)
2755 {
2756         u64 mask = ((u64)root->stripesize - 1);
2757         u64 ret = (val + mask) & ~mask;
2758         return ret;
2759 }
2760
2761 /*
2762  * walks the btree of allocated extents and find a hole of a given size.
2763  * The key ins is changed to record the hole:
2764  * ins->objectid == block start
2765  * ins->flags = BTRFS_EXTENT_ITEM_KEY
2766  * ins->offset == number of blocks
2767  * Any available blocks before search_start are skipped.
2768  */
2769 static int noinline find_free_extent(struct btrfs_trans_handle *trans,
2770                                      struct btrfs_root *orig_root,
2771                                      u64 num_bytes, u64 empty_size,
2772                                      u64 search_start, u64 search_end,
2773                                      u64 hint_byte, struct btrfs_key *ins,
2774                                      u64 exclude_start, u64 exclude_nr,
2775                                      int data)
2776 {
2777         int ret = 0;
2778         struct btrfs_root * root = orig_root->fs_info->extent_root;
2779         u64 total_needed = num_bytes;
2780         u64 *last_ptr = NULL;
2781         u64 last_wanted = 0;
2782         struct btrfs_block_group_cache *block_group = NULL;
2783         int chunk_alloc_done = 0;
2784         int empty_cluster = 2 * 1024 * 1024;
2785         int allowed_chunk_alloc = 0;
2786         struct list_head *head = NULL, *cur = NULL;
2787         int loop = 0;
2788         int extra_loop = 0;
2789         struct btrfs_space_info *space_info;
2790
2791         WARN_ON(num_bytes < root->sectorsize);
2792         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2793         ins->objectid = 0;
2794         ins->offset = 0;
2795
2796         if (orig_root->ref_cows || empty_size)
2797                 allowed_chunk_alloc = 1;
2798
2799         if (data & BTRFS_BLOCK_GROUP_METADATA) {
2800                 last_ptr = &root->fs_info->last_alloc;
2801                 empty_cluster = 64 * 1024;
2802         }
2803
2804         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2805                 last_ptr = &root->fs_info->last_data_alloc;
2806
2807         if (last_ptr) {
2808                 if (*last_ptr) {
2809                         hint_byte = *last_ptr;
2810                         last_wanted = *last_ptr;
2811                 } else
2812                         empty_size += empty_cluster;
2813         } else {
2814                 empty_cluster = 0;
2815         }
2816         search_start = max(search_start, first_logical_byte(root, 0));
2817         search_start = max(search_start, hint_byte);
2818
2819         if (last_wanted && search_start != last_wanted) {
2820                 last_wanted = 0;
2821                 empty_size += empty_cluster;
2822         }
2823
2824         total_needed += empty_size;
2825         block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2826         if (!block_group)
2827                 block_group = btrfs_lookup_first_block_group(root->fs_info,
2828                                                              search_start);
2829         space_info = __find_space_info(root->fs_info, data);
2830
2831         down_read(&space_info->groups_sem);
2832         while (1) {
2833                 struct btrfs_free_space *free_space;
2834                 /*
2835                  * the only way this happens if our hint points to a block
2836                  * group thats not of the proper type, while looping this
2837                  * should never happen
2838                  */
2839                 if (empty_size)
2840                         extra_loop = 1;
2841
2842                 if (!block_group)
2843                         goto new_group_no_lock;
2844
2845                 if (unlikely(!block_group->cached)) {
2846                         mutex_lock(&block_group->cache_mutex);
2847                         ret = cache_block_group(root, block_group);
2848                         mutex_unlock(&block_group->cache_mutex);
2849                         if (ret)
2850                                 break;
2851                 }
2852
2853                 mutex_lock(&block_group->alloc_mutex);
2854                 if (unlikely(!block_group_bits(block_group, data)))
2855                         goto new_group;
2856
2857                 if (unlikely(block_group->ro))
2858                         goto new_group;
2859
2860                 free_space = btrfs_find_free_space(block_group, search_start,
2861                                                    total_needed);
2862                 if (free_space) {
2863                         u64 start = block_group->key.objectid;
2864                         u64 end = block_group->key.objectid +
2865                                 block_group->key.offset;
2866
2867                         search_start = stripe_align(root, free_space->offset);
2868
2869                         /* move on to the next group */
2870                         if (search_start + num_bytes >= search_end)
2871                                 goto new_group;
2872
2873                         /* move on to the next group */
2874                         if (search_start + num_bytes > end)
2875                                 goto new_group;
2876
2877                         if (last_wanted && search_start != last_wanted) {
2878                                 total_needed += empty_cluster;
2879                                 empty_size += empty_cluster;
2880                                 last_wanted = 0;
2881                                 /*
2882                                  * if search_start is still in this block group
2883                                  * then we just re-search this block group
2884                                  */
2885                                 if (search_start >= start &&
2886                                     search_start < end) {
2887                                         mutex_unlock(&block_group->alloc_mutex);
2888                                         continue;
2889                                 }
2890
2891                                 /* else we go to the next block group */
2892                                 goto new_group;
2893                         }
2894
2895                         if (exclude_nr > 0 &&
2896                             (search_start + num_bytes > exclude_start &&
2897                              search_start < exclude_start + exclude_nr)) {
2898                                 search_start = exclude_start + exclude_nr;
2899                                 /*
2900                                  * if search_start is still in this block group
2901                                  * then we just re-search this block group
2902                                  */
2903                                 if (search_start >= start &&
2904                                     search_start < end) {
2905                                         mutex_unlock(&block_group->alloc_mutex);
2906                                         last_wanted = 0;
2907                                         continue;
2908                                 }
2909
2910                                 /* else we go to the next block group */
2911                                 goto new_group;
2912                         }
2913
2914                         ins->objectid = search_start;
2915                         ins->offset = num_bytes;
2916
2917                         btrfs_remove_free_space_lock(block_group, search_start,
2918                                                      num_bytes);
2919                         /* we are all good, lets return */
2920                         mutex_unlock(&block_group->alloc_mutex);
2921                         break;
2922                 }
2923 new_group:
2924                 mutex_unlock(&block_group->alloc_mutex);
2925 new_group_no_lock:
2926                 /* don't try to compare new allocations against the
2927                  * last allocation any more
2928                  */
2929                 last_wanted = 0;
2930
2931                 /*
2932                  * Here's how this works.
2933                  * loop == 0: we were searching a block group via a hint
2934                  *              and didn't find anything, so we start at
2935                  *              the head of the block groups and keep searching
2936                  * loop == 1: we're searching through all of the block groups
2937                  *              if we hit the head again we have searched
2938                  *              all of the block groups for this space and we
2939                  *              need to try and allocate, if we cant error out.
2940                  * loop == 2: we allocated more space and are looping through
2941                  *              all of the block groups again.
2942                  */
2943                 if (loop == 0) {
2944                         head = &space_info->block_groups;
2945                         cur = head->next;
2946                         loop++;
2947                 } else if (loop == 1 && cur == head) {
2948                         int keep_going;
2949
2950                         /* at this point we give up on the empty_size
2951                          * allocations and just try to allocate the min
2952                          * space.
2953                          *
2954                          * The extra_loop field was set if an empty_size
2955                          * allocation was attempted above, and if this
2956                          * is try we need to try the loop again without
2957                          * the additional empty_size.
2958                          */
2959                         total_needed -= empty_size;
2960                         empty_size = 0;
2961                         keep_going = extra_loop;
2962                         loop++;
2963
2964                         if (allowed_chunk_alloc && !chunk_alloc_done) {
2965                                 up_read(&space_info->groups_sem);
2966                                 ret = do_chunk_alloc(trans, root, num_bytes +
2967                                                      2 * 1024 * 1024, data, 1);
2968                                 down_read(&space_info->groups_sem);
2969                                 if (ret < 0)
2970                                         goto loop_check;
2971                                 head = &space_info->block_groups;
2972                                 /*
2973                                  * we've allocated a new chunk, keep
2974                                  * trying
2975                                  */
2976                                 keep_going = 1;
2977                                 chunk_alloc_done = 1;
2978                         } else if (!allowed_chunk_alloc) {
2979                                 space_info->force_alloc = 1;
2980                         }
2981 loop_check:
2982                         if (keep_going) {
2983                                 cur = head->next;
2984                                 extra_loop = 0;
2985                         } else {
2986                                 break;
2987                         }
2988                 } else if (cur == head) {
2989                         break;
2990                 }
2991
2992                 block_group = list_entry(cur, struct btrfs_block_group_cache,
2993                                          list);
2994                 search_start = block_group->key.objectid;
2995                 cur = cur->next;
2996         }
2997
2998         /* we found what we needed */
2999         if (ins->objectid) {
3000                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
3001                         trans->block_group = block_group;
3002
3003                 if (last_ptr)
3004                         *last_ptr = ins->objectid + ins->offset;
3005                 ret = 0;
3006         } else if (!ret) {
3007                 printk(KERN_ERR "we were searching for %Lu bytes, num_bytes %Lu,"
3008                        " loop %d, allowed_alloc %d\n", total_needed, num_bytes,
3009                        loop, allowed_chunk_alloc);
3010                 ret = -ENOSPC;
3011         }
3012
3013         up_read(&space_info->groups_sem);
3014         return ret;
3015 }
3016
3017 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3018 {
3019         struct btrfs_block_group_cache *cache;
3020         struct list_head *l;
3021
3022         printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
3023                info->total_bytes - info->bytes_used - info->bytes_pinned -
3024                info->bytes_reserved, (info->full) ? "" : "not ");
3025
3026         down_read(&info->groups_sem);
3027         list_for_each(l, &info->block_groups) {
3028                 cache = list_entry(l, struct btrfs_block_group_cache, list);
3029                 spin_lock(&cache->lock);
3030                 printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
3031                        "%Lu pinned %Lu reserved\n",
3032                        cache->key.objectid, cache->key.offset,
3033                        btrfs_block_group_used(&cache->item),
3034                        cache->pinned, cache->reserved);
3035                 btrfs_dump_free_space(cache, bytes);
3036                 spin_unlock(&cache->lock);
3037         }
3038         up_read(&info->groups_sem);
3039 }
3040
3041 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3042                                   struct btrfs_root *root,
3043                                   u64 num_bytes, u64 min_alloc_size,
3044                                   u64 empty_size, u64 hint_byte,
3045                                   u64 search_end, struct btrfs_key *ins,
3046                                   u64 data)
3047 {
3048         int ret;
3049         u64 search_start = 0;
3050         u64 alloc_profile;
3051         struct btrfs_fs_info *info = root->fs_info;
3052
3053         if (data) {
3054                 alloc_profile = info->avail_data_alloc_bits &
3055                                 info->data_alloc_profile;
3056                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057         } else if (root == root->fs_info->chunk_root) {
3058                 alloc_profile = info->avail_system_alloc_bits &
3059                                 info->system_alloc_profile;
3060                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061         } else {
3062                 alloc_profile = info->avail_metadata_alloc_bits &
3063                                 info->metadata_alloc_profile;
3064                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065         }
3066 again:
3067         data = btrfs_reduce_alloc_profile(root, data);
3068         /*
3069          * the only place that sets empty_size is btrfs_realloc_node, which
3070          * is not called recursively on allocations
3071          */
3072         if (empty_size || root->ref_cows) {
3073                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3074                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3075                                      2 * 1024 * 1024,
3076                                      BTRFS_BLOCK_GROUP_METADATA |
3077                                      (info->metadata_alloc_profile &
3078                                       info->avail_metadata_alloc_bits), 0);
3079                 }
3080                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3081                                      num_bytes + 2 * 1024 * 1024, data, 0);
3082         }
3083
3084         WARN_ON(num_bytes < root->sectorsize);
3085         ret = find_free_extent(trans, root, num_bytes, empty_size,
3086                                search_start, search_end, hint_byte, ins,
3087                                trans->alloc_exclude_start,
3088                                trans->alloc_exclude_nr, data);
3089
3090         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3091                 num_bytes = num_bytes >> 1;
3092                 num_bytes = num_bytes & ~(root->sectorsize - 1);
3093                 num_bytes = max(num_bytes, min_alloc_size);
3094                 do_chunk_alloc(trans, root->fs_info->extent_root,
3095                                num_bytes, data, 1);
3096                 goto again;
3097         }
3098         if (ret) {
3099                 struct btrfs_space_info *sinfo;
3100
3101                 sinfo = __find_space_info(root->fs_info, data);
3102                 printk("allocation failed flags %Lu, wanted %Lu\n",
3103                        data, num_bytes);
3104                 dump_space_info(sinfo, num_bytes);
3105                 BUG();
3106         }
3107
3108         return ret;
3109 }
3110
3111 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3112 {
3113         struct btrfs_block_group_cache *cache;
3114
3115         cache = btrfs_lookup_block_group(root->fs_info, start);
3116         if (!cache) {
3117                 printk(KERN_ERR "Unable to find block group for %Lu\n", start);
3118                 return -ENOSPC;
3119         }
3120         btrfs_add_free_space(cache, start, len);
3121         update_reserved_extents(root, start, len, 0);
3122         return 0;
3123 }
3124
3125 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3126                                   struct btrfs_root *root,
3127                                   u64 num_bytes, u64 min_alloc_size,
3128                                   u64 empty_size, u64 hint_byte,
3129                                   u64 search_end, struct btrfs_key *ins,
3130                                   u64 data)
3131 {
3132         int ret;
3133         ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3134                                      empty_size, hint_byte, search_end, ins,
3135                                      data);
3136         update_reserved_extents(root, ins->objectid, ins->offset, 1);
3137         return ret;
3138 }
3139
3140 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3141                                          struct btrfs_root *root, u64 parent,
3142                                          u64 root_objectid, u64 ref_generation,
3143                                          u64 owner, struct btrfs_key *ins)
3144 {
3145         int ret;
3146         int pending_ret;
3147         u64 super_used;
3148         u64 root_used;
3149         u64 num_bytes = ins->offset;
3150         u32 sizes[2];
3151         struct btrfs_fs_info *info = root->fs_info;
3152         struct btrfs_root *extent_root = info->extent_root;
3153         struct btrfs_extent_item *extent_item;
3154         struct btrfs_extent_ref *ref;
3155         struct btrfs_path *path;
3156         struct btrfs_key keys[2];
3157
3158         if (parent == 0)
3159                 parent = ins->objectid;
3160
3161         /* block accounting for super block */
3162         spin_lock_irq(&info->delalloc_lock);
3163         super_used = btrfs_super_bytes_used(&info->super_copy);
3164         btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3165         spin_unlock_irq(&info->delalloc_lock);
3166
3167         /* block accounting for root item */
3168         root_used = btrfs_root_used(&root->root_item);
3169         btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3170
3171         if (root == extent_root) {
3172                 struct pending_extent_op *extent_op;
3173
3174                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3175                 BUG_ON(!extent_op);
3176
3177                 extent_op->type = PENDING_EXTENT_INSERT;
3178                 extent_op->bytenr = ins->objectid;
3179                 extent_op->num_bytes = ins->offset;
3180                 extent_op->parent = parent;
3181                 extent_op->orig_parent = 0;
3182                 extent_op->generation = ref_generation;
3183                 extent_op->orig_generation = 0;
3184                 extent_op->level = (int)owner;
3185                 INIT_LIST_HEAD(&extent_op->list);
3186                 extent_op->del = 0;
3187
3188                 mutex_lock(&root->fs_info->extent_ins_mutex);
3189                 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3190                                 ins->objectid + ins->offset - 1,
3191                                 EXTENT_WRITEBACK, GFP_NOFS);
3192                 set_state_private(&root->fs_info->extent_ins,
3193                                   ins->objectid, (unsigned long)extent_op);
3194                 mutex_unlock(&root->fs_info->extent_ins_mutex);
3195                 goto update_block;
3196         }
3197
3198         memcpy(&keys[0], ins, sizeof(*ins));
3199         keys[1].objectid = ins->objectid;
3200         keys[1].type = BTRFS_EXTENT_REF_KEY;
3201         keys[1].offset = parent;
3202         sizes[0] = sizeof(*extent_item);
3203         sizes[1] = sizeof(*ref);
3204
3205         path = btrfs_alloc_path();
3206         BUG_ON(!path);
3207
3208         ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3209                                        sizes, 2);
3210         BUG_ON(ret);
3211
3212         extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3213                                      struct btrfs_extent_item);
3214         btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3215         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3216                              struct btrfs_extent_ref);
3217
3218         btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3219         btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3220         btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3221         btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3222
3223         btrfs_mark_buffer_dirty(path->nodes[0]);
3224
3225         trans->alloc_exclude_start = 0;
3226         trans->alloc_exclude_nr = 0;
3227         btrfs_free_path(path);
3228         finish_current_insert(trans, extent_root, 0);
3229         pending_ret = del_pending_extents(trans, extent_root, 0);
3230
3231         if (ret)
3232                 goto out;
3233         if (pending_ret) {
3234                 ret = pending_ret;
3235                 goto out;
3236         }
3237
3238 update_block:
3239         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
3240         if (ret) {
3241                 printk("update block group failed for %Lu %Lu\n",
3242                        ins->objectid, ins->offset);
3243                 BUG();
3244         }
3245 out:
3246         return ret;
3247 }
3248
3249 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3250                                 struct btrfs_root *root, u64 parent,
3251                                 u64 root_objectid, u64 ref_generation,
3252                                 u64 owner, struct btrfs_key *ins)
3253 {
3254         int ret;
3255
3256         if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3257                 return 0;
3258         ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3259                                             ref_generation, owner, ins);
3260         update_reserved_extents(root, ins->objectid, ins->offset, 0);
3261         return ret;
3262 }
3263
3264 /*
3265  * this is used by the tree logging recovery code.  It records that
3266  * an extent has been allocated and makes sure to clear the free
3267  * space cache bits as well
3268  */
3269 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3270                                 struct btrfs_root *root, u64 parent,
3271                                 u64 root_objectid, u64 ref_generation,
3272                                 u64 owner, struct btrfs_key *ins)
3273 {
3274         int ret;
3275         struct btrfs_block_group_cache *block_group;
3276
3277         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3278         mutex_lock(&block_group->cache_mutex);
3279         cache_block_group(root, block_group);
3280         mutex_unlock(&block_group->cache_mutex);
3281
3282         ret = btrfs_remove_free_space(block_group, ins->objectid,
3283                                       ins->offset);
3284         BUG_ON(ret);
3285         ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3286                                             ref_generation, owner, ins);
3287         return ret;
3288 }
3289
3290 /*
3291  * finds a free extent and does all the dirty work required for allocation
3292  * returns the key for the extent through ins, and a tree buffer for
3293  * the first block of the extent through buf.
3294  *
3295  * returns 0 if everything worked, non-zero otherwise.
3296  */
3297 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3298                        struct btrfs_root *root,
3299                        u64 num_bytes, u64 parent, u64 min_alloc_size,
3300                        u64 root_objectid, u64 ref_generation,
3301                        u64 owner_objectid, u64 empty_size, u64 hint_byte,
3302                        u64 search_end, struct btrfs_key *ins, u64 data)
3303 {
3304         int ret;
3305
3306         ret = __btrfs_reserve_extent(trans, root, num_bytes,
3307                                      min_alloc_size, empty_size, hint_byte,
3308                                      search_end, ins, data);
3309         BUG_ON(ret);
3310         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3311                 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3312                                         root_objectid, ref_generation,
3313                                         owner_objectid, ins);
3314                 BUG_ON(ret);
3315
3316         } else {
3317                 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3318         }
3319         return ret;
3320 }
3321
3322 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3323                                             struct btrfs_root *root,
3324                                             u64 bytenr, u32 blocksize)
3325 {
3326         struct extent_buffer *buf;
3327
3328         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3329         if (!buf)
3330                 return ERR_PTR(-ENOMEM);
3331         btrfs_set_header_generation(buf, trans->transid);
3332         btrfs_tree_lock(buf);
3333         clean_tree_block(trans, root, buf);
3334         btrfs_set_buffer_uptodate(buf);
3335         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3336                 set_extent_dirty(&root->dirty_log_pages, buf->start,
3337                          buf->start + buf->len - 1, GFP_NOFS);
3338         } else {
3339                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3340                          buf->start + buf->len - 1, GFP_NOFS);
3341         }
3342         trans->blocks_used++;
3343         return buf;
3344 }
3345
3346 /*
3347  * helper function to allocate a block for a given tree
3348  * returns the tree buffer or NULL.
3349  */
3350 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3351                                              struct btrfs_root *root,
3352                                              u32 blocksize, u64 parent,
3353                                              u64 root_objectid,
3354                                              u64 ref_generation,
3355                                              int level,
3356                                              u64 hint,
3357                                              u64 empty_size)
3358 {
3359         struct btrfs_key ins;
3360         int ret;
3361         struct extent_buffer *buf;
3362
3363         ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3364                                  root_objectid, ref_generation, level,
3365                                  empty_size, hint, (u64)-1, &ins, 0);
3366         if (ret) {
3367                 BUG_ON(ret > 0);
3368                 return ERR_PTR(ret);
3369         }
3370
3371         buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3372         return buf;
3373 }
3374
3375 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3376                         struct btrfs_root *root, struct extent_buffer *leaf)
3377 {
3378         u64 leaf_owner;
3379         u64 leaf_generation;
3380         struct btrfs_key key;
3381         struct btrfs_file_extent_item *fi;
3382         int i;
3383         int nritems;
3384         int ret;
3385
3386         BUG_ON(!btrfs_is_leaf(leaf));
3387         nritems = btrfs_header_nritems(leaf);
3388         leaf_owner = btrfs_header_owner(leaf);
3389         leaf_generation = btrfs_header_generation(leaf);
3390
3391         for (i = 0; i < nritems; i++) {
3392                 u64 disk_bytenr;
3393                 cond_resched();
3394
3395                 btrfs_item_key_to_cpu(leaf, &key, i);
3396                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3397                         continue;
3398                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3399                 if (btrfs_file_extent_type(leaf, fi) ==
3400                     BTRFS_FILE_EXTENT_INLINE)
3401                         continue;
3402                 /*
3403                  * FIXME make sure to insert a trans record that
3404                  * repeats the snapshot del on crash
3405                  */
3406                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3407                 if (disk_bytenr == 0)
3408                         continue;
3409
3410                 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3411                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
3412                                 leaf->start, leaf_owner, leaf_generation,
3413                                 key.objectid, 0);
3414                 BUG_ON(ret);
3415
3416                 atomic_inc(&root->fs_info->throttle_gen);
3417                 wake_up(&root->fs_info->transaction_throttle);
3418                 cond_resched();
3419         }
3420         return 0;
3421 }
3422
3423 static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3424                                         struct btrfs_root *root,
3425                                         struct btrfs_leaf_ref *ref)
3426 {
3427         int i;
3428         int ret;
3429         struct btrfs_extent_info *info = ref->extents;
3430
3431         for (i = 0; i < ref->nritems; i++) {
3432                 ret = __btrfs_free_extent(trans, root, info->bytenr,
3433                                           info->num_bytes, ref->bytenr,
3434                                           ref->owner, ref->generation,
3435                                           info->objectid, 0);
3436
3437                 atomic_inc(&root->fs_info->throttle_gen);
3438                 wake_up(&root->fs_info->transaction_throttle);
3439                 cond_resched();
3440
3441                 BUG_ON(ret);
3442                 info++;
3443         }
3444
3445         return 0;
3446 }
3447
3448 static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
3449                               u32 *refs)
3450 {
3451         int ret;
3452
3453         ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3454         BUG_ON(ret);
3455
3456 #if 0 // some debugging code in case we see problems here
3457         /* if the refs count is one, it won't get increased again.  But
3458          * if the ref count is > 1, someone may be decreasing it at
3459          * the same time we are.
3460          */
3461         if (*refs != 1) {
3462                 struct extent_buffer *eb = NULL;
3463                 eb = btrfs_find_create_tree_block(root, start, len);
3464                 if (eb)
3465                         btrfs_tree_lock(eb);
3466
3467                 mutex_lock(&root->fs_info->alloc_mutex);
3468                 ret = lookup_extent_ref(NULL, root, start, len, refs);
3469                 BUG_ON(ret);
3470                 mutex_unlock(&root->fs_info->alloc_mutex);
3471
3472                 if (eb) {
3473                         btrfs_tree_unlock(eb);
3474                         free_extent_buffer(eb);
3475                 }
3476                 if (*refs == 1) {
3477                         printk("block %llu went down to one during drop_snap\n",
3478                                (unsigned long long)start);
3479                 }
3480
3481         }
3482 #endif
3483
3484         cond_resched();
3485         return ret;
3486 }
3487
3488 /*
3489  * helper function for drop_snapshot, this walks down the tree dropping ref
3490  * counts as it goes.
3491  */
3492 static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
3493                                    struct btrfs_root *root,
3494                                    struct btrfs_path *path, int *level)
3495 {
3496         u64 root_owner;
3497         u64 root_gen;
3498         u64 bytenr;
3499         u64 ptr_gen;
3500         struct extent_buffer *next;
3501         struct extent_buffer *cur;
3502         struct extent_buffer *parent;
3503         struct btrfs_leaf_ref *ref;
3504         u32 blocksize;
3505         int ret;
3506         u32 refs;
3507
3508         WARN_ON(*level < 0);
3509         WARN_ON(*level >= BTRFS_MAX_LEVEL);
3510         ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
3511                                 path->nodes[*level]->len, &refs);
3512         BUG_ON(ret);
3513         if (refs > 1)
3514                 goto out;
3515
3516         /*
3517          * walk down to the last node level and free all the leaves
3518          */
3519         while(*level >= 0) {
3520                 WARN_ON(*level < 0);
3521                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3522                 cur = path->nodes[*level];
3523
3524                 if (btrfs_header_level(cur) != *level)
3525                         WARN_ON(1);
3526
3527                 if (path->slots[*level] >=
3528                     btrfs_header_nritems(cur))
3529                         break;
3530                 if (*level == 0) {
3531                         ret = btrfs_drop_leaf_ref(trans, root, cur);
3532                         BUG_ON(ret);
3533                         break;
3534                 }
3535                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3536                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3537                 blocksize = btrfs_level_size(root, *level - 1);
3538
3539                 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3540                 BUG_ON(ret);
3541                 if (refs != 1) {
3542                         parent = path->nodes[*level];
3543                         root_owner = btrfs_header_owner(parent);
3544                         root_gen = btrfs_header_generation(parent);
3545                         path->slots[*level]++;
3546
3547                         ret = __btrfs_free_extent(trans, root, bytenr,
3548                                                 blocksize, parent->start,
3549                                                 root_owner, root_gen,
3550                                                 *level - 1, 1);
3551                         BUG_ON(ret);
3552
3553                         atomic_inc(&root->fs_info->throttle_gen);
3554                         wake_up(&root->fs_info->transaction_throttle);
3555                         cond_resched();
3556
3557                         continue;
3558                 }
3559                 /*
3560                  * at this point, we have a single ref, and since the
3561                  * only place referencing this extent is a dead root
3562                  * the reference count should never go higher.
3563                  * So, we don't need to check it again
3564                  */
3565                 if (*level == 1) {
3566                         ref = btrfs_lookup_leaf_ref(root, bytenr);
3567                         if (ref && ref->generation != ptr_gen) {
3568                                 btrfs_free_leaf_ref(root, ref);
3569                                 ref = NULL;
3570                         }
3571                         if (ref) {
3572                                 ret = cache_drop_leaf_ref(trans, root, ref);
3573                                 BUG_ON(ret);
3574                                 btrfs_remove_leaf_ref(root, ref);
3575                                 btrfs_free_leaf_ref(root, ref);
3576                                 *level = 0;
3577                                 break;
3578                         }
3579                         if (printk_ratelimit()) {
3580                                 printk("leaf ref miss for bytenr %llu\n",
3581                                        (unsigned long long)bytenr);
3582                         }
3583                 }
3584                 next = btrfs_find_tree_block(root, bytenr, blocksize);
3585                 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3586                         free_extent_buffer(next);
3587
3588                         next = read_tree_block(root, bytenr, blocksize,
3589                                                ptr_gen);
3590                         cond_resched();
3591 #if 0
3592                         /*
3593                          * this is a debugging check and can go away
3594                          * the ref should never go all the way down to 1
3595                          * at this point
3596                          */
3597                         ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3598                                                 &refs);
3599                         BUG_ON(ret);
3600                         WARN_ON(refs != 1);
3601 #endif
3602                 }
3603                 WARN_ON(*level <= 0);
3604                 if (path->nodes[*level-1])
3605                         free_extent_buffer(path->nodes[*level-1]);
3606                 path->nodes[*level-1] = next;
3607                 *level = btrfs_header_level(next);
3608                 path->slots[*level] = 0;
3609                 cond_resched();
3610         }
3611 out:
3612         WARN_ON(*level < 0);
3613         WARN_ON(*level >= BTRFS_MAX_LEVEL);
3614
3615         if (path->nodes[*level] == root->node) {
3616                 parent = path->nodes[*level];
3617                 bytenr = path->nodes[*level]->start;
3618         } else {
3619                 parent = path->nodes[*level + 1];
3620                 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3621         }
3622
3623         blocksize = btrfs_level_size(root, *level);
3624         root_owner = btrfs_header_owner(parent);
3625         root_gen = btrfs_header_generation(parent);
3626
3627         ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3628                                   parent->start, root_owner, root_gen,
3629                                   *level, 1);
3630         free_extent_buffer(path->nodes[*level]);
3631         path->nodes[*level] = NULL;
3632         *level += 1;
3633         BUG_ON(ret);
3634
3635         cond_resched();
3636         return 0;
3637 }
3638
3639 /*
3640  * helper function for drop_subtree, this function is similar to
3641  * walk_down_tree. The main difference is that it checks reference
3642  * counts while tree blocks are locked.
3643  */
3644 static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
3645                                       struct btrfs_root *root,
3646                                       struct btrfs_path *path, int *level)
3647 {
3648         struct extent_buffer *next;
3649         struct extent_buffer *cur;
3650         struct extent_buffer *parent;
3651         u64 bytenr;
3652         u64 ptr_gen;
3653         u32 blocksize;
3654         u32 refs;
3655         int ret;
3656
3657         cur = path->nodes[*level];
3658         ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3659                                       &refs);
3660         BUG_ON(ret);
3661         if (refs > 1)
3662                 goto out;
3663
3664         while (*level >= 0) {
3665                 cur = path->nodes[*level];
3666                 if (*level == 0) {
3667                         ret = btrfs_drop_leaf_ref(trans, root, cur);
3668                         BUG_ON(ret);
3669                         clean_tree_block(trans, root, cur);
3670                         break;
3671                 }
3672                 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3673                         clean_tree_block(trans, root, cur);
3674                         break;
3675                 }
3676
3677                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3678                 blocksize = btrfs_level_size(root, *level - 1);
3679                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3680
3681                 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3682                 btrfs_tree_lock(next);
3683
3684                 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3685                                               &refs);
3686                 BUG_ON(ret);
3687                 if (refs > 1) {
3688                         parent = path->nodes[*level];
3689                         ret = btrfs_free_extent(trans, root, bytenr,
3690                                         blocksize, parent->start,
3691                                         btrfs_header_owner(parent),
3692                                         btrfs_header_generation(parent),
3693                                         *level - 1, 1);
3694                         BUG_ON(ret);
3695                         path->slots[*level]++;
3696                         btrfs_tree_unlock(next);
3697                         free_extent_buffer(next);
3698                         continue;
3699                 }
3700
3701                 *level = btrfs_header_level(next);
3702                 path->nodes[*level] = next;
3703                 path->slots[*level] = 0;
3704                 path->locks[*level] = 1;
3705                 cond_resched();
3706         }
3707 out:
3708         parent = path->nodes[*level + 1];
3709         bytenr = path->nodes[*level]->start;
3710         blocksize = path->nodes[*level]->len;
3711
3712         ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3713                         parent->start, btrfs_header_owner(parent),
3714                         btrfs_header_generation(parent), *level, 1);
3715         BUG_ON(ret);
3716
3717         if (path->locks[*level]) {
3718                 btrfs_tree_unlock(path->nodes[*level]);
3719                 path->locks[*level] = 0;
3720         }
3721         free_extent_buffer(path->nodes[*level]);
3722         path->nodes[*level] = NULL;
3723         *level += 1;
3724         cond_resched();
3725         return 0;
3726 }
3727
3728 /*
3729  * helper for dropping snapshots.  This walks back up the tree in the path
3730  * to find the first node higher up where we haven't yet gone through
3731  * all the slots
3732  */
3733 static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
3734                                  struct btrfs_root *root,
3735                                  struct btrfs_path *path,
3736                                  int *level, int max_level)
3737 {
3738         u64 root_owner;
3739         u64 root_gen;
3740         struct btrfs_root_item *root_item = &root->root_item;
3741         int i;
3742         int slot;
3743         int ret;
3744
3745         for (i = *level; i < max_level && path->nodes[i]; i++) {
3746                 slot = path->slots[i];
3747                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3748                         struct extent_buffer *node;
3749                         struct btrfs_disk_key disk_key;
3750                         node = path->nodes[i];
3751                         path->slots[i]++;
3752                         *level = i;
3753                         WARN_ON(*level == 0);
3754                         btrfs_node_key(node, &disk_key, path->slots[i]);
3755                         memcpy(&root_item->drop_progress,
3756                                &disk_key, sizeof(disk_key));
3757                         root_item->drop_level = i;
3758                         return 0;
3759                 } else {
3760                         struct extent_buffer *parent;
3761                         if (path->nodes[*level] == root->node)
3762                                 parent = path->nodes[*level];
3763                         else
3764                                 parent = path->nodes[*level + 1];
3765
3766                         root_owner = btrfs_header_owner(parent);
3767                         root_gen = btrfs_header_generation(parent);
3768
3769                         clean_tree_block(trans, root, path->nodes[*level]);
3770                         ret = btrfs_free_extent(trans, root,
3771                                                 path->nodes[*level]->start,
3772                                                 path->nodes[*level]->len,
3773                                                 parent->start, root_owner,
3774                                                 root_gen, *level, 1);
3775                         BUG_ON(ret);
3776                         if (path->locks[*level]) {
3777                                 btrfs_tree_unlock(path->nodes[*level]);
3778                                 path->locks[*level] = 0;
3779                         }
3780                         free_extent_buffer(path->nodes[*level]);
3781                         path->nodes[*level] = NULL;
3782                         *level = i + 1;
3783                 }
3784         }
3785         return 1;
3786 }
3787
3788 /*
3789  * drop the reference count on the tree rooted at 'snap'.  This traverses
3790  * the tree freeing any blocks that have a ref count of zero after being
3791  * decremented.
3792  */
3793 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3794                         *root)
3795 {
3796         int ret = 0;
3797         int wret;
3798         int level;
3799         struct btrfs_path *path;
3800         int i;
3801         int orig_level;
3802         struct btrfs_root_item *root_item = &root->root_item;
3803
3804         WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3805         path = btrfs_alloc_path();
3806         BUG_ON(!path);
3807
3808         level = btrfs_header_level(root->node);
3809         orig_level = level;
3810         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3811                 path->nodes[level] = root->node;
3812                 extent_buffer_get(root->node);
3813                 path->slots[level] = 0;
3814         } else {
3815                 struct btrfs_key key;
3816                 struct btrfs_disk_key found_key;
3817                 struct extent_buffer *node;
3818
3819                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3820                 level = root_item->drop_level;
3821                 path->lowest_level = level;
3822                 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3823                 if (wret < 0) {
3824                         ret = wret;
3825                         goto out;
3826                 }
3827                 node = path->nodes[level];
3828                 btrfs_node_key(node, &found_key, path->slots[level]);
3829                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3830                                sizeof(found_key)));
3831                 /*
3832                  * unlock our path, this is safe because only this
3833                  * function is allowed to delete this snapshot
3834                  */
3835                 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3836                         if (path->nodes[i] && path->locks[i]) {
3837                                 path->locks[i] = 0;
3838                                 btrfs_tree_unlock(path->nodes[i]);
3839                         }
3840                 }
3841         }
3842         while(1) {
3843                 wret = walk_down_tree(trans, root, path, &level);
3844                 if (wret > 0)
3845                         break;
3846                 if (wret < 0)
3847                         ret = wret;
3848
3849                 wret = walk_up_tree(trans, root, path, &level,
3850                                     BTRFS_MAX_LEVEL);
3851                 if (wret > 0)
3852                         break;
3853                 if (wret < 0)
3854                         ret = wret;
3855                 if (trans->transaction->in_commit) {
3856                         ret = -EAGAIN;
3857                         break;
3858                 }
3859                 atomic_inc(&root->fs_info->throttle_gen);
3860                 wake_up(&root->fs_info->transaction_throttle);
3861         }
3862         for (i = 0; i <= orig_level; i++) {
3863                 if (path->nodes[i]) {
3864                         free_extent_buffer(path->nodes[i]);
3865                         path->nodes[i] = NULL;
3866                 }
3867         }
3868 out:
3869         btrfs_free_path(path);
3870         return ret;
3871 }
3872
3873 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3874                         struct btrfs_root *root,
3875                         struct extent_buffer *node,
3876                         struct extent_buffer *parent)
3877 {
3878         struct btrfs_path *path;
3879         int level;
3880         int parent_level;
3881         int ret = 0;
3882         int wret;
3883
3884         path = btrfs_alloc_path();
3885         BUG_ON(!path);
3886
3887         BUG_ON(!btrfs_tree_locked(parent));
3888         parent_level = btrfs_header_level(parent);
3889         extent_buffer_get(parent);
3890         path->nodes[parent_level] = parent;
3891         path->slots[parent_level] = btrfs_header_nritems(parent);
3892
3893         BUG_ON(!btrfs_tree_locked(node));
3894         level = btrfs_header_level(node);
3895         extent_buffer_get(node);
3896         path->nodes[level] = node;
3897         path->slots[level] = 0;
3898
3899         while (1) {
3900                 wret = walk_down_subtree(trans, root, path, &level);
3901                 if (wret < 0)
3902                         ret = wret;
3903                 if (wret != 0)
3904                         break;
3905
3906                 wret = walk_up_tree(trans, root, path, &level, parent_level);
3907                 if (wret < 0)
3908                         ret = wret;
3909                 if (wret != 0)
3910                         break;
3911         }
3912
3913         btrfs_free_path(path);
3914         return ret;
3915 }
3916
3917 static unsigned long calc_ra(unsigned long start, unsigned long last,
3918                              unsigned long nr)
3919 {
3920         return min(last, start + nr - 1);
3921 }
3922
3923 static int noinline relocate_inode_pages(struct inode *inode, u64 start,
3924                                          u64 len)
3925 {
3926         u64 page_start;
3927         u64 page_end;
3928         unsigned long first_index;
3929         unsigned long last_index;
3930         unsigned long i;
3931         struct page *page;
3932         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3933         struct file_ra_state *ra;
3934         struct btrfs_ordered_extent *ordered;
3935         unsigned int total_read = 0;
3936         unsigned int total_dirty = 0;
3937         int ret = 0;
3938
3939         ra = kzalloc(sizeof(*ra), GFP_NOFS);
3940
3941         mutex_lock(&inode->i_mutex);
3942         first_index = start >> PAGE_CACHE_SHIFT;
3943         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3944
3945         /* make sure the dirty trick played by the caller work */
3946         ret = invalidate_inode_pages2_range(inode->i_mapping,
3947                                             first_index, last_index);
3948         if (ret)
3949                 goto out_unlock;
3950
3951         file_ra_state_init(ra, inode->i_mapping);
3952
3953         for (i = first_index ; i <= last_index; i++) {
3954                 if (total_read % ra->ra_pages == 0) {
3955                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3956                                        calc_ra(i, last_index, ra->ra_pages));
3957                 }
3958                 total_read++;
3959 again:
3960                 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3961                         BUG_ON(1);
3962                 page = grab_cache_page(inode->i_mapping, i);
3963                 if (!page) {
3964                         ret = -ENOMEM;
3965                         goto out_unlock;
3966                 }
3967                 if (!PageUptodate(page)) {
3968                         btrfs_readpage(NULL, page);
3969                         lock_page(page);
3970                         if (!PageUptodate(page)) {
3971                                 unlock_page(page);
3972                                 page_cache_release(page);
3973                                 ret = -EIO;
3974                                 goto out_unlock;
3975                         }
3976                 }
3977                 wait_on_page_writeback(page);
3978
3979                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3980                 page_end = page_start + PAGE_CACHE_SIZE - 1;
3981                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3982
3983                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3984                 if (ordered) {
3985                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3986                         unlock_page(page);
3987                         page_cache_release(page);
3988                         btrfs_start_ordered_extent(inode, ordered, 1);
3989                         btrfs_put_ordered_extent(ordered);
3990                         goto again;
3991                 }
3992                 set_page_extent_mapped(page);
3993
3994                 btrfs_set_extent_delalloc(inode, page_start, page_end);
3995                 if (i == first_index)
3996                         set_extent_bits(io_tree, page_start, page_end,
3997                                         EXTENT_BOUNDARY, GFP_NOFS);
3998
3999                 set_page_dirty(page);
4000                 total_dirty++;
4001
4002                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4003                 unlock_page(page);
4004                 page_cache_release(page);
4005         }
4006
4007 out_unlock:
4008         kfree(ra);
4009         mutex_unlock(&inode->i_mutex);
4010         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4011         return ret;
4012 }
4013
4014 static int noinline relocate_data_extent(struct inode *reloc_inode,
4015                                          struct btrfs_key *extent_key,
4016                                          u64 offset)
4017 {
4018         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4019         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4020         struct extent_map *em;
4021         u64 start = extent_key->objectid - offset;
4022         u64 end = start + extent_key->offset - 1;
4023
4024         em = alloc_extent_map(GFP_NOFS);
4025         BUG_ON(!em || IS_ERR(em));
4026
4027         em->start = start;
4028         em->len = extent_key->offset;
4029         em->block_len = extent_key->offset;
4030         em->block_start = extent_key->objectid;
4031         em->bdev = root->fs_info->fs_devices->latest_bdev;
4032         set_bit(EXTENT_FLAG_PINNED, &em->flags);
4033
4034         /* setup extent map to cheat btrfs_readpage */
4035         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4036         while (1) {
4037                 int ret;
4038                 spin_lock(&em_tree->lock);
4039                 ret = add_extent_mapping(em_tree, em);
4040                 spin_unlock(&em_tree->lock);
4041                 if (ret != -EEXIST) {
4042                         free_extent_map(em);
4043                         break;
4044                 }
4045                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4046         }
4047         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4048
4049         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4050 }
4051
4052 struct btrfs_ref_path {
4053         u64 extent_start;
4054         u64 nodes[BTRFS_MAX_LEVEL];
4055         u64 root_objectid;
4056         u64 root_generation;
4057         u64 owner_objectid;
4058         u32 num_refs;
4059         int lowest_level;
4060         int current_level;
4061         int shared_level;
4062
4063         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4064         u64 new_nodes[BTRFS_MAX_LEVEL];
4065 };
4066
4067 struct disk_extent {
4068         u64 ram_bytes;
4069         u64 disk_bytenr;
4070         u64 disk_num_bytes;
4071         u64 offset;
4072         u64 num_bytes;
4073         u8 compression;
4074         u8 encryption;
4075         u16 other_encoding;
4076 };
4077
4078 static int is_cowonly_root(u64 root_objectid)
4079 {
4080         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4081             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4082             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4083             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4084             root_objectid == BTRFS_TREE_LOG_OBJECTID)
4085                 return 1;
4086         return 0;
4087 }
4088
4089 static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
4090                                     struct btrfs_root *extent_root,
4091                                     struct btrfs_ref_path *ref_path,
4092                                     int first_time)
4093 {
4094         struct extent_buffer *leaf;
4095         struct btrfs_path *path;
4096         struct btrfs_extent_ref *ref;
4097         struct btrfs_key key;
4098         struct btrfs_key found_key;
4099         u64 bytenr;
4100         u32 nritems;
4101         int level;
4102         int ret = 1;
4103
4104         path = btrfs_alloc_path();
4105         if (!path)
4106                 return -ENOMEM;
4107
4108         if (first_time) {
4109                 ref_path->lowest_level = -1;
4110                 ref_path->current_level = -1;
4111                 ref_path->shared_level = -1;
4112                 goto walk_up;
4113         }
4114 walk_down:
4115         level = ref_path->current_level - 1;
4116         while (level >= -1) {
4117                 u64 parent;
4118                 if (level < ref_path->lowest_level)
4119                         break;
4120
4121                 if (level >= 0) {
4122                         bytenr = ref_path->nodes[level];
4123                 } else {
4124                         bytenr = ref_path->extent_start;
4125                 }
4126                 BUG_ON(bytenr == 0);
4127
4128                 parent = ref_path->nodes[level + 1];
4129                 ref_path->nodes[level + 1] = 0;
4130                 ref_path->current_level = level;
4131                 BUG_ON(parent == 0);
4132
4133                 key.objectid = bytenr;
4134                 key.offset = parent + 1;
4135                 key.type = BTRFS_EXTENT_REF_KEY;
4136
4137                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4138                 if (ret < 0)
4139                         goto out;
4140                 BUG_ON(ret == 0);
4141
4142                 leaf = path->nodes[0];
4143                 nritems = btrfs_header_nritems(leaf);
4144                 if (path->slots[0] >= nritems) {
4145                         ret = btrfs_next_leaf(extent_root, path);
4146                         if (ret < 0)
4147                                 goto out;
4148                         if (ret > 0)
4149                                 goto next;
4150                         leaf = path->nodes[0];
4151                 }
4152
4153                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4154                 if (found_key.objectid == bytenr &&
4155                     found_key.type == BTRFS_EXTENT_REF_KEY) {
4156                         if (level < ref_path->shared_level)
4157                                 ref_path->shared_level = level;
4158                         goto found;
4159                 }
4160 next:
4161                 level--;
4162                 btrfs_release_path(extent_root, path);
4163                 cond_resched();
4164         }
4165         /* reached lowest level */
4166         ret = 1;
4167         goto out;
4168 walk_up:
4169         level = ref_path->current_level;
4170         while (level < BTRFS_MAX_LEVEL - 1) {
4171                 u64 ref_objectid;
4172                 if (level >= 0) {
4173                         bytenr = ref_path->nodes[level];
4174                 } else {
4175                         bytenr = ref_path->extent_start;
4176                 }
4177                 BUG_ON(bytenr == 0);
4178
4179                 key.objectid = bytenr;
4180                 key.offset = 0;
4181                 key.type = BTRFS_EXTENT_REF_KEY;
4182
4183                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4184                 if (ret < 0)
4185                         goto out;
4186
4187                 leaf = path->nodes[0];
4188                 nritems = btrfs_header_nritems(leaf);
4189                 if (path->slots[0] >= nritems) {
4190                         ret = btrfs_next_leaf(extent_root, path);
4191                         if (ret < 0)
4192                                 goto out;
4193                         if (ret > 0) {
4194                                 /* the extent was freed by someone */
4195                                 if (ref_path->lowest_level == level)
4196                                         goto out;
4197                                 btrfs_release_path(extent_root, path);
4198                                 goto walk_down;
4199                         }
4200                         leaf = path->nodes[0];
4201                 }
4202
4203                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4204                 if (found_key.objectid != bytenr ||
4205                                 found_key.type != BTRFS_EXTENT_REF_KEY) {
4206                         /* the extent was freed by someone */
4207                         if (ref_path->lowest_level == level) {
4208                                 ret = 1;
4209                                 goto out;
4210                         }
4211                         btrfs_release_path(extent_root, path);
4212                         goto walk_down;
4213                 }
4214 found:
4215                 ref = btrfs_item_ptr(leaf, path->slots[0],
4216                                 struct btrfs_extent_ref);
4217                 ref_objectid = btrfs_ref_objectid(leaf, ref);
4218                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4219                         if (first_time) {
4220                                 level = (int)ref_objectid;
4221                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
4222                                 ref_path->lowest_level = level;
4223                                 ref_path->current_level = level;
4224                                 ref_path->nodes[level] = bytenr;
4225                         } else {
4226                                 WARN_ON(ref_objectid != level);
4227                         }
4228                 } else {
4229                         WARN_ON(level != -1);
4230                 }
4231                 first_time = 0;
4232
4233                 if (ref_path->lowest_level == level) {
4234                         ref_path->owner_objectid = ref_objectid;
4235                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4236                 }
4237
4238                 /*
4239                  * the block is tree root or the block isn't in reference
4240                  * counted tree.
4241                  */
4242                 if (found_key.objectid == found_key.offset ||
4243                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4244                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4245                         ref_path->root_generation =
4246                                 btrfs_ref_generation(leaf, ref);
4247                         if (level < 0) {
4248                                 /* special reference from the tree log */
4249                                 ref_path->nodes[0] = found_key.offset;
4250                                 ref_path->current_level = 0;
4251                         }
4252                         ret = 0;
4253                         goto out;
4254                 }
4255
4256                 level++;
4257                 BUG_ON(ref_path->nodes[level] != 0);
4258                 ref_path->nodes[level] = found_key.offset;
4259                 ref_path->current_level = level;
4260
4261                 /*
4262                  * the reference was created in the running transaction,
4263                  * no need to continue walking up.
4264                  */
4265                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4266                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4267                         ref_path->root_generation =
4268                                 btrfs_ref_generation(leaf, ref);
4269                         ret = 0;
4270                         goto out;
4271                 }
4272
4273                 btrfs_release_path(extent_root, path);
4274                 cond_resched();
4275         }
4276         /* reached max tree level, but no tree root found. */
4277         BUG();
4278 out:
4279         btrfs_free_path(path);
4280         return ret;
4281 }
4282
4283 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4284                                 struct btrfs_root *extent_root,
4285                                 struct btrfs_ref_path *ref_path,
4286                                 u64 extent_start)
4287 {
4288         memset(ref_path, 0, sizeof(*ref_path));
4289         ref_path->extent_start = extent_start;
4290
4291         return __next_ref_path(trans, extent_root, ref_path, 1);
4292 }
4293
4294 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4295                                struct btrfs_root *extent_root,
4296                                struct btrfs_ref_path *ref_path)
4297 {
4298         return __next_ref_path(trans, extent_root, ref_path, 0);
4299 }
4300
4301 static int noinline get_new_locations(struct inode *reloc_inode,
4302                                       struct btrfs_key *extent_key,
4303                                       u64 offset, int no_fragment,
4304                                       struct disk_extent **extents,
4305                                       int *nr_extents)
4306 {
4307         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4308         struct btrfs_path *path;
4309         struct btrfs_file_extent_item *fi;
4310         struct extent_buffer *leaf;
4311         struct disk_extent *exts = *extents;
4312         struct btrfs_key found_key;
4313         u64 cur_pos;
4314         u64 last_byte;
4315         u32 nritems;
4316         int nr = 0;
4317         int max = *nr_extents;
4318         int ret;
4319
4320         WARN_ON(!no_fragment && *extents);
4321         if (!exts) {
4322                 max = 1;
4323                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4324                 if (!exts)
4325                         return -ENOMEM;
4326         }
4327
4328         path = btrfs_alloc_path();
4329         BUG_ON(!path);
4330
4331         cur_pos = extent_key->objectid - offset;
4332         last_byte = extent_key->objectid + extent_key->offset;
4333         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4334                                        cur_pos, 0);
4335         if (ret < 0)
4336                 goto out;
4337         if (ret > 0) {
4338                 ret = -ENOENT;
4339                 goto out;
4340         }
4341
4342         while (1) {
4343                 leaf = path->nodes[0];
4344                 nritems = btrfs_header_nritems(leaf);
4345                 if (path->slots[0] >= nritems) {
4346                         ret = btrfs_next_leaf(root, path);
4347                         if (ret < 0)
4348                                 goto out;
4349                         if (ret > 0)
4350                                 break;
4351                         leaf = path->nodes[0];
4352                 }
4353
4354                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4355                 if (found_key.offset != cur_pos ||
4356                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
4357                     found_key.objectid != reloc_inode->i_ino)
4358                         break;
4359
4360                 fi = btrfs_item_ptr(leaf, path->slots[0],
4361                                     struct btrfs_file_extent_item);
4362                 if (btrfs_file_extent_type(leaf, fi) !=
4363                     BTRFS_FILE_EXTENT_REG ||
4364                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4365                         break;
4366
4367                 if (nr == max) {
4368                         struct disk_extent *old = exts;
4369                         max *= 2;
4370                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4371                         memcpy(exts, old, sizeof(*exts) * nr);
4372                         if (old != *extents)
4373                                 kfree(old);
4374                 }
4375
4376                 exts[nr].disk_bytenr =
4377                         btrfs_file_extent_disk_bytenr(leaf, fi);
4378                 exts[nr].disk_num_bytes =
4379                         btrfs_file_extent_disk_num_bytes(leaf, fi);
4380                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4381                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4382                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4383                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4384                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4385                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4386                                                                            fi);
4387                 BUG_ON(exts[nr].offset > 0);
4388                 BUG_ON(exts[nr].compression || exts[nr].encryption);
4389                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4390
4391                 cur_pos += exts[nr].num_bytes;
4392                 nr++;
4393
4394                 if (cur_pos + offset >= last_byte)
4395                         break;
4396
4397                 if (no_fragment) {
4398                         ret = 1;
4399                         goto out;
4400                 }
4401                 path->slots[0]++;
4402         }
4403
4404         WARN_ON(cur_pos + offset > last_byte);
4405         if (cur_pos + offset < last_byte) {
4406                 ret = -ENOENT;
4407                 goto out;
4408         }
4409         ret = 0;
4410 out:
4411         btrfs_free_path(path);
4412         if (ret) {
4413                 if (exts != *extents)
4414                         kfree(exts);
4415         } else {
4416                 *extents = exts;
4417                 *nr_extents = nr;
4418         }
4419         return ret;
4420 }
4421
4422 static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
4423                                         struct btrfs_root *root,
4424                                         struct btrfs_path *path,
4425                                         struct btrfs_key *extent_key,
4426                                         struct btrfs_key *leaf_key,
4427                                         struct btrfs_ref_path *ref_path,
4428                                         struct disk_extent *new_extents,
4429                                         int nr_extents)
4430 {
4431         struct extent_buffer *leaf;
4432         struct btrfs_file_extent_item *fi;
4433         struct inode *inode = NULL;
4434         struct btrfs_key key;
4435         u64 lock_start = 0;
4436         u64 lock_end = 0;
4437         u64 num_bytes;
4438         u64 ext_offset;
4439         u64 first_pos;
4440         u32 nritems;
4441         int nr_scaned = 0;
4442         int extent_locked = 0;
4443         int extent_type;
4444         int ret;
4445
4446         memcpy(&key, leaf_key, sizeof(key));
4447         first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4448         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4449                 if (key.objectid < ref_path->owner_objectid ||
4450                     (key.objectid == ref_path->owner_objectid &&
4451                      key.type < BTRFS_EXTENT_DATA_KEY)) {
4452                         key.objectid = ref_path->owner_objectid;
4453                         key.type = BTRFS_EXTENT_DATA_KEY;
4454                         key.offset = 0;
4455                 }
4456         }
4457
4458         while (1) {
4459                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4460                 if (ret < 0)
4461                         goto out;
4462
4463                 leaf = path->nodes[0];
4464                 nritems = btrfs_header_nritems(leaf);
4465 next:
4466                 if (extent_locked && ret > 0) {
4467                         /*
4468                          * the file extent item was modified by someone
4469                          * before the extent got locked.
4470                          */
4471                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4472                                       lock_end, GFP_NOFS);
4473                         extent_locked = 0;
4474                 }
4475
4476                 if (path->slots[0] >= nritems) {
4477                         if (++nr_scaned > 2)
4478                                 break;
4479
4480                         BUG_ON(extent_locked);
4481                         ret = btrfs_next_leaf(root, path);
4482                         if (ret < 0)
4483                                 goto out;
4484                         if (ret > 0)
4485                                 break;
4486                         leaf = path->nodes[0];
4487                         nritems = btrfs_header_nritems(leaf);
4488                 }
4489
4490                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4491
4492                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4493                         if ((key.objectid > ref_path->owner_objectid) ||
4494                             (key.objectid == ref_path->owner_objectid &&
4495                              key.type > BTRFS_EXTENT_DATA_KEY) ||
4496                             (key.offset >= first_pos + extent_key->offset))
4497                                 break;
4498                 }
4499
4500                 if (inode && key.objectid != inode->i_ino) {
4501                         BUG_ON(extent_locked);
4502                         btrfs_release_path(root, path);
4503                         mutex_unlock(&inode->i_mutex);
4504                         iput(inode);
4505                         inode = NULL;
4506                         continue;
4507                 }
4508
4509                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4510                         path->slots[0]++;
4511                         ret = 1;
4512                         goto next;
4513                 }
4514                 fi = btrfs_item_ptr(leaf, path->slots[0],
4515                                     struct btrfs_file_extent_item);
4516                 extent_type = btrfs_file_extent_type(leaf, fi);
4517                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4518                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4519                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4520                      extent_key->objectid)) {
4521                         path->slots[0]++;
4522                         ret = 1;
4523                         goto next;
4524                 }
4525
4526                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4527                 ext_offset = btrfs_file_extent_offset(leaf, fi);
4528
4529                 if (first_pos > key.offset - ext_offset)
4530                         first_pos = key.offset - ext_offset;
4531
4532                 if (!extent_locked) {
4533                         lock_start = key.offset;
4534                         lock_end = lock_start + num_bytes - 1;
4535                 } else {
4536                         if (lock_start > key.offset ||
4537                             lock_end + 1 < key.offset + num_bytes) {
4538                                 unlock_extent(&BTRFS_I(inode)->io_tree,
4539                                               lock_start, lock_end, GFP_NOFS);
4540                                 extent_locked = 0;
4541                         }
4542                 }
4543
4544                 if (!inode) {
4545                         btrfs_release_path(root, path);
4546
4547                         inode = btrfs_iget_locked(root->fs_info->sb,
4548                                                   key.objectid, root);
4549                         if (inode->i_state & I_NEW) {
4550                                 BTRFS_I(inode)->root = root;
4551                                 BTRFS_I(inode)->location.objectid =
4552                                         key.objectid;
4553                                 BTRFS_I(inode)->location.type =
4554                                         BTRFS_INODE_ITEM_KEY;
4555                                 BTRFS_I(inode)->location.offset = 0;
4556                                 btrfs_read_locked_inode(inode);
4557                                 unlock_new_inode(inode);
4558                         }
4559                         /*
4560                          * some code call btrfs_commit_transaction while
4561                          * holding the i_mutex, so we can't use mutex_lock
4562                          * here.
4563                          */
4564                         if (is_bad_inode(inode) ||
4565                             !mutex_trylock(&inode->i_mutex)) {
4566                                 iput(inode);
4567                                 inode = NULL;
4568                                 key.offset = (u64)-1;
4569                                 goto skip;
4570                         }
4571                 }
4572
4573                 if (!extent_locked) {
4574                         struct btrfs_ordered_extent *ordered;
4575
4576                         btrfs_release_path(root, path);
4577
4578                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4579                                     lock_end, GFP_NOFS);
4580                         ordered = btrfs_lookup_first_ordered_extent(inode,
4581                                                                     lock_end);
4582                         if (ordered &&
4583                             ordered->file_offset <= lock_end &&
4584                             ordered->file_offset + ordered->len > lock_start) {
4585                                 unlock_extent(&BTRFS_I(inode)->io_tree,
4586                                               lock_start, lock_end, GFP_NOFS);
4587                                 btrfs_start_ordered_extent(inode, ordered, 1);
4588                                 btrfs_put_ordered_extent(ordered);
4589                                 key.offset += num_bytes;
4590                                 goto skip;
4591                         }
4592                         if (ordered)
4593                                 btrfs_put_ordered_extent(ordered);
4594
4595                         extent_locked = 1;
4596                         continue;
4597                 }
4598
4599                 if (nr_extents == 1) {
4600                         /* update extent pointer in place */
4601                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
4602                                                 new_extents[0].disk_bytenr);
4603                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4604                                                 new_extents[0].disk_num_bytes);
4605                         btrfs_mark_buffer_dirty(leaf);
4606
4607                         btrfs_drop_extent_cache(inode, key.offset,
4608                                                 key.offset + num_bytes - 1, 0);
4609
4610                         ret = btrfs_inc_extent_ref(trans, root,
4611                                                 new_extents[0].disk_bytenr,
4612                                                 new_extents[0].disk_num_bytes,
4613                                                 leaf->start,
4614                                                 root->root_key.objectid,
4615                                                 trans->transid,
4616                                                 key.objectid);
4617                         BUG_ON(ret);
4618
4619                         ret = btrfs_free_extent(trans, root,
4620                                                 extent_key->objectid,
4621                                                 extent_key->offset,
4622                                                 leaf->start,
4623                                                 btrfs_header_owner(leaf),
4624                                                 btrfs_header_generation(leaf),
4625                                                 key.objectid, 0);
4626                         BUG_ON(ret);
4627
4628                         btrfs_release_path(root, path);
4629                         key.offset += num_bytes;
4630                 } else {
4631                         BUG_ON(1);
4632 #if 0
4633                         u64 alloc_hint;
4634                         u64 extent_len;
4635                         int i;
4636                         /*
4637                          * drop old extent pointer at first, then insert the
4638                          * new pointers one bye one
4639                          */
4640                         btrfs_release_path(root, path);
4641                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
4642                                                  key.offset + num_bytes,
4643                                                  key.offset, &alloc_hint);
4644                         BUG_ON(ret);
4645
4646                         for (i = 0; i < nr_extents; i++) {
4647                                 if (ext_offset >= new_extents[i].num_bytes) {
4648                                         ext_offset -= new_extents[i].num_bytes;
4649                                         continue;
4650                                 }
4651                                 extent_len = min(new_extents[i].num_bytes -
4652                                                  ext_offset, num_bytes);
4653
4654                                 ret = btrfs_insert_empty_item(trans, root,
4655                                                               path, &key,
4656                                                               sizeof(*fi));
4657                                 BUG_ON(ret);
4658
4659                                 leaf = path->nodes[0];
4660                                 fi = btrfs_item_ptr(leaf, path->slots[0],
4661                                                 struct btrfs_file_extent_item);
4662                                 btrfs_set_file_extent_generation(leaf, fi,
4663                                                         trans->transid);
4664                                 btrfs_set_file_extent_type(leaf, fi,
4665                                                         BTRFS_FILE_EXTENT_REG);
4666                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4667                                                 new_extents[i].disk_bytenr);
4668                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4669                                                 new_extents[i].disk_num_bytes);
4670                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
4671                                                 new_extents[i].ram_bytes);
4672
4673                                 btrfs_set_file_extent_compression(leaf, fi,
4674                                                 new_extents[i].compression);
4675                                 btrfs_set_file_extent_encryption(leaf, fi,
4676                                                 new_extents[i].encryption);
4677                                 btrfs_set_file_extent_other_encoding(leaf, fi,
4678                                                 new_extents[i].other_encoding);
4679
4680                                 btrfs_set_file_extent_num_bytes(leaf, fi,
4681                                                         extent_len);
4682                                 ext_offset += new_extents[i].offset;
4683                                 btrfs_set_file_extent_offset(leaf, fi,
4684                                                         ext_offset);
4685                                 btrfs_mark_buffer_dirty(leaf);
4686
4687                                 btrfs_drop_extent_cache(inode, key.offset,
4688                                                 key.offset + extent_len - 1, 0);
4689
4690                                 ret = btrfs_inc_extent_ref(trans, root,
4691                                                 new_extents[i].disk_bytenr,
4692                                                 new_extents[i].disk_num_bytes,
4693                                                 leaf->start,
4694                                                 root->root_key.objectid,
4695                                                 trans->transid, key.objectid);
4696                                 BUG_ON(ret);
4697                                 btrfs_release_path(root, path);
4698
4699                                 inode_add_bytes(inode, extent_len);
4700
4701                                 ext_offset = 0;
4702                                 num_bytes -= extent_len;
4703                                 key.offset += extent_len;
4704
4705                                 if (num_bytes == 0)
4706                                         break;
4707                         }
4708                         BUG_ON(i >= nr_extents);
4709 #endif
4710                 }
4711
4712                 if (extent_locked) {
4713                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4714                                       lock_end, GFP_NOFS);
4715                         extent_locked = 0;
4716                 }
4717 skip:
4718                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4719                     key.offset >= first_pos + extent_key->offset)
4720                         break;
4721
4722                 cond_resched();
4723         }
4724         ret = 0;
4725 out:
4726         btrfs_release_path(root, path);
4727         if (inode) {
4728                 mutex_unlock(&inode->i_mutex);
4729                 if (extent_locked) {
4730                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4731                                       lock_end, GFP_NOFS);
4732                 }
4733                 iput(inode);
4734         }
4735         return ret;
4736 }
4737
4738 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4739                                struct btrfs_root *root,
4740                                struct extent_buffer *buf, u64 orig_start)
4741 {
4742         int level;
4743         int ret;
4744
4745         BUG_ON(btrfs_header_generation(buf) != trans->transid);
4746         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4747
4748         level = btrfs_header_level(buf);
4749         if (level == 0) {
4750                 struct btrfs_leaf_ref *ref;
4751                 struct btrfs_leaf_ref *orig_ref;
4752
4753                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4754                 if (!orig_ref)
4755                         return -ENOENT;
4756
4757                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4758                 if (!ref) {
4759                         btrfs_free_leaf_ref(root, orig_ref);
4760                         return -ENOMEM;
4761                 }
4762
4763                 ref->nritems = orig_ref->nritems;
4764                 memcpy(ref->extents, orig_ref->extents,
4765                         sizeof(ref->extents[0]) * ref->nritems);
4766
4767                 btrfs_free_leaf_ref(root, orig_ref);
4768
4769                 ref->root_gen = trans->transid;
4770                 ref->bytenr = buf->start;
4771                 ref->owner = btrfs_header_owner(buf);
4772                 ref->generation = btrfs_header_generation(buf);
4773                 ret = btrfs_add_leaf_ref(root, ref, 0);
4774                 WARN_ON(ret);
4775                 btrfs_free_leaf_ref(root, ref);
4776         }
4777         return 0;
4778 }
4779
4780 static int noinline invalidate_extent_cache(struct btrfs_root *root,
4781                                         struct extent_buffer *leaf,
4782                                         struct btrfs_block_group_cache *group,
4783                                         struct btrfs_root *target_root)
4784 {
4785         struct btrfs_key key;
4786         struct inode *inode = NULL;
4787         struct btrfs_file_extent_item *fi;
4788         u64 num_bytes;
4789         u64 skip_objectid = 0;
4790         u32 nritems;
4791         u32 i;
4792
4793         nritems = btrfs_header_nritems(leaf);
4794         for (i = 0; i < nritems; i++) {
4795                 btrfs_item_key_to_cpu(leaf, &key, i);
4796                 if (key.objectid == skip_objectid ||
4797                     key.type != BTRFS_EXTENT_DATA_KEY)
4798                         continue;
4799                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4800                 if (btrfs_file_extent_type(leaf, fi) ==
4801                     BTRFS_FILE_EXTENT_INLINE)
4802                         continue;
4803                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4804                         continue;
4805                 if (!inode || inode->i_ino != key.objectid) {
4806                         iput(inode);
4807                         inode = btrfs_ilookup(target_root->fs_info->sb,
4808                                               key.objectid, target_root, 1);
4809                 }
4810                 if (!inode) {
4811                         skip_objectid = key.objectid;
4812                         continue;
4813                 }
4814                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4815
4816                 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4817                             key.offset + num_bytes - 1, GFP_NOFS);
4818                 btrfs_drop_extent_cache(inode, key.offset,
4819                                         key.offset + num_bytes - 1, 1);
4820                 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4821                               key.offset + num_bytes - 1, GFP_NOFS);
4822                 cond_resched();
4823         }
4824         iput(inode);
4825         return 0;
4826 }
4827
4828 static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4829                                         struct btrfs_root *root,
4830                                         struct extent_buffer *leaf,
4831                                         struct btrfs_block_group_cache *group,
4832                                         struct inode *reloc_inode)
4833 {
4834         struct btrfs_key key;
4835         struct btrfs_key extent_key;
4836         struct btrfs_file_extent_item *fi;
4837         struct btrfs_leaf_ref *ref;
4838         struct disk_extent *new_extent;
4839         u64 bytenr;
4840         u64 num_bytes;
4841         u32 nritems;
4842         u32 i;
4843         int ext_index;
4844         int nr_extent;
4845         int ret;
4846
4847         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4848         BUG_ON(!new_extent);
4849
4850         ref = btrfs_lookup_leaf_ref(root, leaf->start);
4851         BUG_ON(!ref);
4852
4853         ext_index = -1;
4854         nritems = btrfs_header_nritems(leaf);
4855         for (i = 0; i < nritems; i++) {
4856                 btrfs_item_key_to_cpu(leaf, &key, i);
4857                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4858                         continue;
4859                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4860                 if (btrfs_file_extent_type(leaf, fi) ==
4861                     BTRFS_FILE_EXTENT_INLINE)
4862                         continue;
4863                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4864                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4865                 if (bytenr == 0)
4866                         continue;
4867
4868                 ext_index++;
4869                 if (bytenr >= group->key.objectid + group->key.offset ||
4870                     bytenr + num_bytes <= group->key.objectid)
4871                         continue;
4872
4873                 extent_key.objectid = bytenr;
4874                 extent_key.offset = num_bytes;
4875                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4876                 nr_extent = 1;
4877                 ret = get_new_locations(reloc_inode, &extent_key,
4878                                         group->key.objectid, 1,
4879                                         &new_extent, &nr_extent);
4880                 if (ret > 0)
4881                         continue;
4882                 BUG_ON(ret < 0);
4883
4884                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4885                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4886                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4887                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4888
4889                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4890                                                 new_extent->disk_bytenr);
4891                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4892                                                 new_extent->disk_num_bytes);
4893                 btrfs_mark_buffer_dirty(leaf);
4894
4895                 ret = btrfs_inc_extent_ref(trans, root,
4896                                         new_extent->disk_bytenr,
4897                                         new_extent->disk_num_bytes,
4898                                         leaf->start,
4899                                         root->root_key.objectid,
4900                                         trans->transid, key.objectid);
4901                 BUG_ON(ret);
4902                 ret = btrfs_free_extent(trans, root,
4903                                         bytenr, num_bytes, leaf->start,
4904                                         btrfs_header_owner(leaf),
4905                                         btrfs_header_generation(leaf),
4906                                         key.objectid, 0);
4907                 BUG_ON(ret);
4908                 cond_resched();
4909         }
4910         kfree(new_extent);
4911         BUG_ON(ext_index + 1 != ref->nritems);
4912         btrfs_free_leaf_ref(root, ref);
4913         return 0;
4914 }
4915
4916 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4917                           struct btrfs_root *root)
4918 {
4919         struct btrfs_root *reloc_root;
4920         int ret;
4921
4922         if (root->reloc_root) {
4923                 reloc_root = root->reloc_root;
4924                 root->reloc_root = NULL;
4925                 list_add(&reloc_root->dead_list,
4926                          &root->fs_info->dead_reloc_roots);
4927
4928                 btrfs_set_root_bytenr(&reloc_root->root_item,
4929                                       reloc_root->node->start);
4930                 btrfs_set_root_level(&root->root_item,
4931                                      btrfs_header_level(reloc_root->node));
4932                 memset(&reloc_root->root_item.drop_progress, 0,
4933                         sizeof(struct btrfs_disk_key));
4934                 reloc_root->root_item.drop_level = 0;
4935
4936                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4937                                         &reloc_root->root_key,
4938                                         &reloc_root->root_item);
4939                 BUG_ON(ret);
4940         }
4941         return 0;
4942 }
4943
4944 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4945 {
4946         struct btrfs_trans_handle *trans;
4947         struct btrfs_root *reloc_root;
4948         struct btrfs_root *prev_root = NULL;
4949         struct list_head dead_roots;
4950         int ret;
4951         unsigned long nr;
4952
4953         INIT_LIST_HEAD(&dead_roots);
4954         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4955
4956         while (!list_empty(&dead_roots)) {
4957                 reloc_root = list_entry(dead_roots.prev,
4958                                         struct btrfs_root, dead_list);
4959                 list_del_init(&reloc_root->dead_list);
4960
4961                 BUG_ON(reloc_root->commit_root != NULL);
4962                 while (1) {
4963                         trans = btrfs_join_transaction(root, 1);
4964                         BUG_ON(!trans);
4965
4966                         mutex_lock(&root->fs_info->drop_mutex);
4967                         ret = btrfs_drop_snapshot(trans, reloc_root);
4968                         if (ret != -EAGAIN)
4969                                 break;
4970                         mutex_unlock(&root->fs_info->drop_mutex);
4971
4972                         nr = trans->blocks_used;
4973                         ret = btrfs_end_transaction(trans, root);
4974                         BUG_ON(ret);
4975                         btrfs_btree_balance_dirty(root, nr);
4976                 }
4977
4978                 free_extent_buffer(reloc_root->node);
4979
4980                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4981                                      &reloc_root->root_key);
4982                 BUG_ON(ret);
4983                 mutex_unlock(&root->fs_info->drop_mutex);
4984
4985                 nr = trans->blocks_used;
4986                 ret = btrfs_end_transaction(trans, root);
4987                 BUG_ON(ret);
4988                 btrfs_btree_balance_dirty(root, nr);
4989
4990                 kfree(prev_root);
4991                 prev_root = reloc_root;
4992         }
4993         if (prev_root) {
4994                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
4995                 kfree(prev_root);
4996         }
4997         return 0;
4998 }
4999
5000 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5001 {
5002         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5003         return 0;
5004 }
5005
5006 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5007 {
5008         struct btrfs_root *reloc_root;
5009         struct btrfs_trans_handle *trans;
5010         struct btrfs_key location;
5011         int found;
5012         int ret;
5013
5014         mutex_lock(&root->fs_info->tree_reloc_mutex);
5015         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5016         BUG_ON(ret);
5017         found = !list_empty(&root->fs_info->dead_reloc_roots);
5018         mutex_unlock(&root->fs_info->tree_reloc_mutex);
5019
5020         if (found) {
5021                 trans = btrfs_start_transaction(root, 1);
5022                 BUG_ON(!trans);
5023                 ret = btrfs_commit_transaction(trans, root);
5024                 BUG_ON(ret);
5025         }
5026
5027         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5028         location.offset = (u64)-1;
5029         location.type = BTRFS_ROOT_ITEM_KEY;
5030
5031         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5032         BUG_ON(!reloc_root);
5033         btrfs_orphan_cleanup(reloc_root);
5034         return 0;
5035 }
5036
5037 static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
5038                                     struct btrfs_root *root)
5039 {
5040         struct btrfs_root *reloc_root;
5041         struct extent_buffer *eb;
5042         struct btrfs_root_item *root_item;
5043         struct btrfs_key root_key;
5044         int ret;
5045
5046         BUG_ON(!root->ref_cows);
5047         if (root->reloc_root)
5048                 return 0;
5049
5050         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5051         BUG_ON(!root_item);
5052
5053         ret = btrfs_copy_root(trans, root, root->commit_root,
5054                               &eb, BTRFS_TREE_RELOC_OBJECTID);
5055         BUG_ON(ret);
5056
5057         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5058         root_key.offset = root->root_key.objectid;
5059         root_key.type = BTRFS_ROOT_ITEM_KEY;
5060
5061         memcpy(root_item, &root->root_item, sizeof(root_item));
5062         btrfs_set_root_refs(root_item, 0);
5063         btrfs_set_root_bytenr(root_item, eb->start);
5064         btrfs_set_root_level(root_item, btrfs_header_level(eb));
5065         btrfs_set_root_generation(root_item, trans->transid);
5066
5067         btrfs_tree_unlock(eb);
5068         free_extent_buffer(eb);
5069
5070         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5071                                 &root_key, root_item);
5072         BUG_ON(ret);
5073         kfree(root_item);
5074
5075         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5076                                                  &root_key);
5077         BUG_ON(!reloc_root);
5078         reloc_root->last_trans = trans->transid;
5079         reloc_root->commit_root = NULL;
5080         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5081
5082         root->reloc_root = reloc_root;
5083         return 0;
5084 }
5085
5086 /*
5087  * Core function of space balance.
5088  *
5089  * The idea is using reloc trees to relocate tree blocks in reference
5090  * counted roots. There is one reloc tree for each subvol, and all
5091  * reloc trees share same root key objectid. Reloc trees are snapshots
5092  * of the latest committed roots of subvols (root->commit_root).
5093  *
5094  * To relocate a tree block referenced by a subvol, there are two steps.
5095  * COW the block through subvol's reloc tree, then update block pointer
5096  * in the subvol to point to the new block. Since all reloc trees share
5097  * same root key objectid, doing special handing for tree blocks owned
5098  * by them is easy. Once a tree block has been COWed in one reloc tree,
5099  * we can use the resulting new block directly when the same block is
5100  * required to COW again through other reloc trees. By this way, relocated
5101  * tree blocks are shared between reloc trees, so they are also shared
5102  * between subvols.
5103  */
5104 static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
5105                                       struct btrfs_root *root,
5106                                       struct btrfs_path *path,
5107                                       struct btrfs_key *first_key,
5108                                       struct btrfs_ref_path *ref_path,
5109                                       struct btrfs_block_group_cache *group,
5110                                       struct inode *reloc_inode)
5111 {
5112         struct btrfs_root *reloc_root;
5113         struct extent_buffer *eb = NULL;
5114         struct btrfs_key *keys;
5115         u64 *nodes;
5116         int level;
5117         int shared_level;
5118         int lowest_level = 0;
5119         int ret;
5120
5121         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5122                 lowest_level = ref_path->owner_objectid;
5123
5124         if (!root->ref_cows) {
5125                 path->lowest_level = lowest_level;
5126                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5127                 BUG_ON(ret < 0);
5128                 path->lowest_level = 0;
5129                 btrfs_release_path(root, path);
5130                 return 0;
5131         }
5132
5133         mutex_lock(&root->fs_info->tree_reloc_mutex);
5134         ret = init_reloc_tree(trans, root);
5135         BUG_ON(ret);
5136         reloc_root = root->reloc_root;
5137
5138         shared_level = ref_path->shared_level;
5139         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5140
5141         keys = ref_path->node_keys;
5142         nodes = ref_path->new_nodes;
5143         memset(&keys[shared_level + 1], 0,
5144                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5145         memset(&nodes[shared_level + 1], 0,
5146                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5147
5148         if (nodes[lowest_level] == 0) {
5149                 path->lowest_level = lowest_level;
5150                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5151                                         0, 1);
5152                 BUG_ON(ret);
5153                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5154                         eb = path->nodes[level];
5155                         if (!eb || eb == reloc_root->node)
5156                                 break;
5157                         nodes[level] = eb->start;
5158                         if (level == 0)
5159                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
5160                         else
5161                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
5162                 }
5163                 if (nodes[0] &&
5164                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5165                         eb = path->nodes[0];
5166                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
5167                                                       group, reloc_inode);
5168                         BUG_ON(ret);
5169                 }
5170                 btrfs_release_path(reloc_root, path);
5171         } else {
5172                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5173                                        lowest_level);
5174                 BUG_ON(ret);
5175         }
5176
5177         /*
5178          * replace tree blocks in the fs tree with tree blocks in
5179          * the reloc tree.
5180          */
5181         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5182         BUG_ON(ret < 0);
5183
5184         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5185                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5186                                         0, 0);
5187                 BUG_ON(ret);
5188                 extent_buffer_get(path->nodes[0]);
5189                 eb = path->nodes[0];
5190                 btrfs_release_path(reloc_root, path);
5191                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
5192                 BUG_ON(ret);
5193                 free_extent_buffer(eb);
5194         }
5195
5196         mutex_unlock(&root->fs_info->tree_reloc_mutex);
5197         path->lowest_level = 0;
5198         return 0;
5199 }
5200
5201 static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
5202                                         struct btrfs_root *root,
5203                                         struct btrfs_path *path,
5204                                         struct btrfs_key *first_key,
5205                                         struct btrfs_ref_path *ref_path)
5206 {
5207         int ret;
5208
5209         ret = relocate_one_path(trans, root, path, first_key,
5210                                 ref_path, NULL, NULL);
5211         BUG_ON(ret);
5212
5213         if (root == root->fs_info->extent_root)
5214                 btrfs_extent_post_op(trans, root);
5215
5216         return 0;
5217 }
5218
5219 static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
5220                                     struct btrfs_root *extent_root,
5221                                     struct btrfs_path *path,
5222                                     struct btrfs_key *extent_key)
5223 {
5224         int ret;
5225
5226         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5227         if (ret)
5228                 goto out;
5229         ret = btrfs_del_item(trans, extent_root, path);
5230 out:
5231         btrfs_release_path(extent_root, path);
5232         return ret;
5233 }
5234
5235 static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
5236                                                 struct btrfs_ref_path *ref_path)
5237 {
5238         struct btrfs_key root_key;
5239
5240         root_key.objectid = ref_path->root_objectid;
5241         root_key.type = BTRFS_ROOT_ITEM_KEY;
5242         if (is_cowonly_root(ref_path->root_objectid))
5243                 root_key.offset = 0;
5244         else
5245                 root_key.offset = (u64)-1;
5246
5247         return btrfs_read_fs_root_no_name(fs_info, &root_key);
5248 }
5249
5250 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
5251                                         struct btrfs_path *path,
5252                                         struct btrfs_key *extent_key,
5253                                         struct btrfs_block_group_cache *group,
5254                                         struct inode *reloc_inode, int pass)
5255 {
5256         struct btrfs_trans_handle *trans;
5257         struct btrfs_root *found_root;
5258         struct btrfs_ref_path *ref_path = NULL;
5259         struct disk_extent *new_extents = NULL;
5260         int nr_extents = 0;
5261         int loops;
5262         int ret;
5263         int level;
5264         struct btrfs_key first_key;
5265         u64 prev_block = 0;
5266
5267
5268         trans = btrfs_start_transaction(extent_root, 1);
5269         BUG_ON(!trans);
5270
5271         if (extent_key->objectid == 0) {
5272                 ret = del_extent_zero(trans, extent_root, path, extent_key);
5273                 goto out;
5274         }
5275
5276         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5277         if (!ref_path) {
5278                ret = -ENOMEM;
5279                goto out;
5280         }
5281
5282         for (loops = 0; ; loops++) {
5283                 if (loops == 0) {
5284                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5285                                                    extent_key->objectid);
5286                 } else {
5287                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5288                 }
5289                 if (ret < 0)
5290                         goto out;
5291                 if (ret > 0)
5292                         break;
5293
5294                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5295                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5296                         continue;
5297
5298                 found_root = read_ref_root(extent_root->fs_info, ref_path);
5299                 BUG_ON(!found_root);
5300                 /*
5301                  * for reference counted tree, only process reference paths
5302                  * rooted at the latest committed root.
5303                  */
5304                 if (found_root->ref_cows &&
5305                     ref_path->root_generation != found_root->root_key.offset)
5306                         continue;
5307
5308                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5309                         if (pass == 0) {
5310                                 /*
5311                                  * copy data extents to new locations
5312                                  */
5313                                 u64 group_start = group->key.objectid;
5314                                 ret = relocate_data_extent(reloc_inode,
5315                                                            extent_key,
5316                                                            group_start);
5317                                 if (ret < 0)
5318                                         goto out;
5319                                 break;
5320                         }
5321                         level = 0;
5322                 } else {
5323                         level = ref_path->owner_objectid;
5324                 }
5325
5326                 if (prev_block != ref_path->nodes[level]) {
5327                         struct extent_buffer *eb;
5328                         u64 block_start = ref_path->nodes[level];
5329                         u64 block_size = btrfs_level_size(found_root, level);
5330
5331                         eb = read_tree_block(found_root, block_start,
5332                                              block_size, 0);
5333                         btrfs_tree_lock(eb);
5334                         BUG_ON(level != btrfs_header_level(eb));
5335
5336                         if (level == 0)
5337                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
5338                         else
5339                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
5340
5341                         btrfs_tree_unlock(eb);
5342                         free_extent_buffer(eb);
5343                         prev_block = block_start;
5344                 }
5345
5346                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
5347                     pass >= 2) {
5348                         /*
5349                          * use fallback method to process the remaining
5350                          * references.
5351                          */
5352                         if (!new_extents) {
5353                                 u64 group_start = group->key.objectid;
5354                                 new_extents = kmalloc(sizeof(*new_extents),
5355                                                       GFP_NOFS);
5356                                 nr_extents = 1;
5357                                 ret = get_new_locations(reloc_inode,
5358                                                         extent_key,
5359                                                         group_start, 1,
5360                                                         &new_extents,
5361                                                         &nr_extents);
5362                                 if (ret)
5363                                         goto out;
5364                         }
5365                         btrfs_record_root_in_trans(found_root);
5366                         ret = replace_one_extent(trans, found_root,
5367                                                 path, extent_key,
5368                                                 &first_key, ref_path,
5369                                                 new_extents, nr_extents);
5370                         if (ret < 0)
5371                                 goto out;
5372                         continue;
5373                 }
5374
5375                 btrfs_record_root_in_trans(found_root);
5376                 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5377                         ret = relocate_tree_block(trans, found_root, path,
5378                                                   &first_key, ref_path);
5379                 } else {
5380                         /*
5381                          * try to update data extent references while
5382                          * keeping metadata shared between snapshots.
5383                          */
5384                         ret = relocate_one_path(trans, found_root, path,
5385                                                 &first_key, ref_path,
5386                                                 group, reloc_inode);
5387                 }
5388                 if (ret < 0)
5389                         goto out;
5390         }
5391         ret = 0;
5392 out:
5393         btrfs_end_transaction(trans, extent_root);
5394         kfree(new_extents);
5395         kfree(ref_path);
5396         return ret;
5397 }
5398
5399 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5400 {
5401         u64 num_devices;
5402         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5403                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5404
5405         num_devices = root->fs_info->fs_devices->rw_devices;
5406         if (num_devices == 1) {
5407                 stripped |= BTRFS_BLOCK_GROUP_DUP;
5408                 stripped = flags & ~stripped;
5409
5410                 /* turn raid0 into single device chunks */
5411                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5412                         return stripped;
5413
5414                 /* turn mirroring into duplication */
5415                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5416                              BTRFS_BLOCK_GROUP_RAID10))
5417                         return stripped | BTRFS_BLOCK_GROUP_DUP;
5418                 return flags;
5419         } else {
5420                 /* they already had raid on here, just return */
5421                 if (flags & stripped)
5422                         return flags;
5423
5424                 stripped |= BTRFS_BLOCK_GROUP_DUP;
5425                 stripped = flags & ~stripped;
5426
5427                 /* switch duplicated blocks with raid1 */
5428                 if (flags & BTRFS_BLOCK_GROUP_DUP)
5429                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
5430
5431                 /* turn single device chunks into raid0 */
5432                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5433         }
5434         return flags;
5435 }
5436
5437 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5438                      struct btrfs_block_group_cache *shrink_block_group,
5439                      int force)
5440 {
5441         struct btrfs_trans_handle *trans;
5442         u64 new_alloc_flags;
5443         u64 calc;
5444
5445         spin_lock(&shrink_block_group->lock);
5446         if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5447                 spin_unlock(&shrink_block_group->lock);
5448
5449                 trans = btrfs_start_transaction(root, 1);
5450                 spin_lock(&shrink_block_group->lock);
5451
5452                 new_alloc_flags = update_block_group_flags(root,
5453                                                    shrink_block_group->flags);
5454                 if (new_alloc_flags != shrink_block_group->flags) {
5455                         calc =
5456                              btrfs_block_group_used(&shrink_block_group->item);
5457                 } else {
5458                         calc = shrink_block_group->key.offset;
5459                 }
5460                 spin_unlock(&shrink_block_group->lock);
5461
5462                 do_chunk_alloc(trans, root->fs_info->extent_root,
5463                                calc + 2 * 1024 * 1024, new_alloc_flags, force);
5464
5465                 btrfs_end_transaction(trans, root);
5466         } else
5467                 spin_unlock(&shrink_block_group->lock);
5468         return 0;
5469 }
5470
5471 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5472                                  struct btrfs_root *root,
5473                                  u64 objectid, u64 size)
5474 {
5475         struct btrfs_path *path;
5476         struct btrfs_inode_item *item;
5477         struct extent_buffer *leaf;
5478         int ret;
5479
5480         path = btrfs_alloc_path();
5481         if (!path)
5482                 return -ENOMEM;
5483
5484         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5485         if (ret)
5486                 goto out;
5487
5488         leaf = path->nodes[0];
5489         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5490         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5491         btrfs_set_inode_generation(leaf, item, 1);
5492         btrfs_set_inode_size(leaf, item, size);
5493         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5494         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM |
5495                                           BTRFS_INODE_NOCOMPRESS);
5496         btrfs_mark_buffer_dirty(leaf);
5497         btrfs_release_path(root, path);
5498 out:
5499         btrfs_free_path(path);
5500         return ret;
5501 }
5502
5503 static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
5504                                         struct btrfs_block_group_cache *group)
5505 {
5506         struct inode *inode = NULL;
5507         struct btrfs_trans_handle *trans;
5508         struct btrfs_root *root;
5509         struct btrfs_key root_key;
5510         u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5511         int err = 0;
5512
5513         root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5514         root_key.type = BTRFS_ROOT_ITEM_KEY;
5515         root_key.offset = (u64)-1;
5516         root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5517         if (IS_ERR(root))
5518                 return ERR_CAST(root);
5519
5520         trans = btrfs_start_transaction(root, 1);
5521         BUG_ON(!trans);
5522
5523         err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5524         if (err)
5525                 goto out;
5526
5527         err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5528         BUG_ON(err);
5529
5530         err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5531                                        group->key.offset, 0, group->key.offset,
5532                                        0, 0, 0);
5533         BUG_ON(err);
5534
5535         inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5536         if (inode->i_state & I_NEW) {
5537                 BTRFS_I(inode)->root = root;
5538                 BTRFS_I(inode)->location.objectid = objectid;
5539                 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5540                 BTRFS_I(inode)->location.offset = 0;
5541                 btrfs_read_locked_inode(inode);
5542                 unlock_new_inode(inode);
5543                 BUG_ON(is_bad_inode(inode));
5544         } else {
5545                 BUG_ON(1);
5546         }
5547
5548         err = btrfs_orphan_add(trans, inode);
5549 out:
5550         btrfs_end_transaction(trans, root);
5551         if (err) {
5552                 if (inode)
5553                         iput(inode);
5554                 inode = ERR_PTR(err);
5555         }
5556         return inode;
5557 }
5558
5559 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5560 {
5561         struct btrfs_trans_handle *trans;
5562         struct btrfs_path *path;
5563         struct btrfs_fs_info *info = root->fs_info;
5564         struct extent_buffer *leaf;
5565         struct inode *reloc_inode;
5566         struct btrfs_block_group_cache *block_group;
5567         struct btrfs_key key;
5568         u64 skipped;
5569         u64 cur_byte;
5570         u64 total_found;
5571         u32 nritems;
5572         int ret;
5573         int progress;
5574         int pass = 0;
5575
5576         root = root->fs_info->extent_root;
5577
5578         block_group = btrfs_lookup_block_group(info, group_start);
5579         BUG_ON(!block_group);
5580
5581         printk("btrfs relocating block group %llu flags %llu\n",
5582                (unsigned long long)block_group->key.objectid,
5583                (unsigned long long)block_group->flags);
5584
5585         path = btrfs_alloc_path();
5586         BUG_ON(!path);
5587
5588         reloc_inode = create_reloc_inode(info, block_group);
5589         BUG_ON(IS_ERR(reloc_inode));
5590
5591         __alloc_chunk_for_shrink(root, block_group, 1);
5592         set_block_group_readonly(block_group);
5593
5594         btrfs_start_delalloc_inodes(info->tree_root);
5595         btrfs_wait_ordered_extents(info->tree_root, 0);
5596 again:
5597         skipped = 0;
5598         total_found = 0;
5599         progress = 0;
5600         key.objectid = block_group->key.objectid;
5601         key.offset = 0;
5602         key.type = 0;
5603         cur_byte = key.objectid;
5604
5605         trans = btrfs_start_transaction(info->tree_root, 1);
5606         btrfs_commit_transaction(trans, info->tree_root);
5607
5608         mutex_lock(&root->fs_info->cleaner_mutex);
5609         btrfs_clean_old_snapshots(info->tree_root);
5610         btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5611         mutex_unlock(&root->fs_info->cleaner_mutex);
5612
5613         while(1) {
5614                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5615                 if (ret < 0)
5616                         goto out;
5617 next:
5618                 leaf = path->nodes[0];
5619                 nritems = btrfs_header_nritems(leaf);
5620                 if (path->slots[0] >= nritems) {
5621                         ret = btrfs_next_leaf(root, path);
5622                         if (ret < 0)
5623                                 goto out;
5624                         if (ret == 1) {
5625                                 ret = 0;
5626                                 break;
5627                         }
5628                         leaf = path->nodes[0];
5629                         nritems = btrfs_header_nritems(leaf);
5630                 }
5631
5632                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5633
5634                 if (key.objectid >= block_group->key.objectid +
5635                     block_group->key.offset)
5636                         break;
5637
5638                 if (progress && need_resched()) {
5639                         btrfs_release_path(root, path);
5640                         cond_resched();
5641                         progress = 0;
5642                         continue;
5643                 }
5644                 progress = 1;
5645
5646                 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5647                     key.objectid + key.offset <= cur_byte) {
5648                         path->slots[0]++;
5649                         goto next;
5650                 }
5651
5652                 total_found++;
5653                 cur_byte = key.objectid + key.offset;
5654                 btrfs_release_path(root, path);
5655
5656                 __alloc_chunk_for_shrink(root, block_group, 0);
5657                 ret = relocate_one_extent(root, path, &key, block_group,
5658                                           reloc_inode, pass);
5659                 BUG_ON(ret < 0);
5660                 if (ret > 0)
5661                         skipped++;
5662
5663                 key.objectid = cur_byte;
5664                 key.type = 0;
5665                 key.offset = 0;
5666         }
5667
5668         btrfs_release_path(root, path);
5669
5670         if (pass == 0) {
5671                 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5672                 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5673                 WARN_ON(reloc_inode->i_mapping->nrpages);
5674         }
5675
5676         if (total_found > 0) {
5677                 printk("btrfs found %llu extents in pass %d\n",
5678                        (unsigned long long)total_found, pass);
5679                 pass++;
5680                 if (total_found == skipped && pass > 2) {
5681                         iput(reloc_inode);
5682                         reloc_inode = create_reloc_inode(info, block_group);
5683                         pass = 0;
5684                 }
5685                 goto again;
5686         }
5687
5688         /* delete reloc_inode */
5689         iput(reloc_inode);
5690
5691         /* unpin extents in this range */
5692         trans = btrfs_start_transaction(info->tree_root, 1);
5693         btrfs_commit_transaction(trans, info->tree_root);
5694
5695         spin_lock(&block_group->lock);
5696         WARN_ON(block_group->pinned > 0);
5697         WARN_ON(block_group->reserved > 0);
5698         WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5699         spin_unlock(&block_group->lock);
5700         ret = 0;
5701 out:
5702         btrfs_free_path(path);
5703         return ret;
5704 }
5705
5706 static int find_first_block_group(struct btrfs_root *root,
5707                 struct btrfs_path *path, struct btrfs_key *key)
5708 {
5709         int ret = 0;
5710         struct btrfs_key found_key;
5711         struct extent_buffer *leaf;
5712         int slot;
5713
5714         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5715         if (ret < 0)
5716                 goto out;
5717
5718         while(1) {
5719                 slot = path->slots[0];
5720                 leaf = path->nodes[0];
5721                 if (slot >= btrfs_header_nritems(leaf)) {
5722                         ret = btrfs_next_leaf(root, path);
5723                         if (ret == 0)
5724                                 continue;
5725                         if (ret < 0)
5726                                 goto out;
5727                         break;
5728                 }
5729                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5730
5731                 if (found_key.objectid >= key->objectid &&
5732                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5733                         ret = 0;
5734                         goto out;
5735                 }
5736                 path->slots[0]++;
5737         }
5738         ret = -ENOENT;
5739 out:
5740         return ret;
5741 }
5742
5743 int btrfs_free_block_groups(struct btrfs_fs_info *info)
5744 {
5745         struct btrfs_block_group_cache *block_group;
5746         struct rb_node *n;
5747
5748         spin_lock(&info->block_group_cache_lock);
5749         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5750                 block_group = rb_entry(n, struct btrfs_block_group_cache,
5751                                        cache_node);
5752                 rb_erase(&block_group->cache_node,
5753                          &info->block_group_cache_tree);
5754                 spin_unlock(&info->block_group_cache_lock);
5755
5756                 btrfs_remove_free_space_cache(block_group);
5757                 down_write(&block_group->space_info->groups_sem);
5758                 list_del(&block_group->list);
5759                 up_write(&block_group->space_info->groups_sem);
5760                 kfree(block_group);
5761
5762                 spin_lock(&info->block_group_cache_lock);
5763         }
5764         spin_unlock(&info->block_group_cache_lock);
5765         return 0;
5766 }
5767
5768 int btrfs_read_block_groups(struct btrfs_root *root)
5769 {
5770         struct btrfs_path *path;
5771         int ret;
5772         struct btrfs_block_group_cache *cache;
5773         struct btrfs_fs_info *info = root->fs_info;
5774         struct btrfs_space_info *space_info;
5775         struct btrfs_key key;
5776         struct btrfs_key found_key;
5777         struct extent_buffer *leaf;
5778
5779         root = info->extent_root;
5780         key.objectid = 0;
5781         key.offset = 0;
5782         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5783         path = btrfs_alloc_path();
5784         if (!path)
5785                 return -ENOMEM;
5786
5787         while(1) {
5788                 ret = find_first_block_group(root, path, &key);
5789                 if (ret > 0) {
5790                         ret = 0;
5791                         goto error;
5792                 }
5793                 if (ret != 0)
5794                         goto error;
5795
5796                 leaf = path->nodes[0];
5797                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5798                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5799                 if (!cache) {
5800                         ret = -ENOMEM;
5801                         break;
5802                 }
5803
5804                 spin_lock_init(&cache->lock);
5805                 mutex_init(&cache->alloc_mutex);
5806                 mutex_init(&cache->cache_mutex);
5807                 INIT_LIST_HEAD(&cache->list);
5808                 read_extent_buffer(leaf, &cache->item,
5809                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
5810                                    sizeof(cache->item));
5811                 memcpy(&cache->key, &found_key, sizeof(found_key));
5812
5813                 key.objectid = found_key.objectid + found_key.offset;
5814                 btrfs_release_path(root, path);
5815                 cache->flags = btrfs_block_group_flags(&cache->item);
5816
5817                 ret = update_space_info(info, cache->flags, found_key.offset,
5818                                         btrfs_block_group_used(&cache->item),
5819                                         &space_info);
5820                 BUG_ON(ret);
5821                 cache->space_info = space_info;
5822                 down_write(&space_info->groups_sem);
5823                 list_add_tail(&cache->list, &space_info->block_groups);
5824                 up_write(&space_info->groups_sem);
5825
5826                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5827                 BUG_ON(ret);
5828
5829                 set_avail_alloc_bits(root->fs_info, cache->flags);
5830                 if (btrfs_chunk_readonly(root, cache->key.objectid))
5831                         set_block_group_readonly(cache);
5832         }
5833         ret = 0;
5834 error:
5835         btrfs_free_path(path);
5836         return ret;
5837 }
5838
5839 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5840                            struct btrfs_root *root, u64 bytes_used,
5841                            u64 type, u64 chunk_objectid, u64 chunk_offset,
5842                            u64 size)
5843 {
5844         int ret;
5845         struct btrfs_root *extent_root;
5846         struct btrfs_block_group_cache *cache;
5847
5848         extent_root = root->fs_info->extent_root;
5849
5850         root->fs_info->last_trans_new_blockgroup = trans->transid;
5851
5852         cache = kzalloc(sizeof(*cache), GFP_NOFS);
5853         if (!cache)
5854                 return -ENOMEM;
5855
5856         cache->key.objectid = chunk_offset;
5857         cache->key.offset = size;
5858         spin_lock_init(&cache->lock);
5859         mutex_init(&cache->alloc_mutex);
5860         mutex_init(&cache->cache_mutex);
5861         INIT_LIST_HEAD(&cache->list);
5862         btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5863
5864         btrfs_set_block_group_used(&cache->item, bytes_used);
5865         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5866         cache->flags = type;
5867         btrfs_set_block_group_flags(&cache->item, type);
5868
5869         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5870                                 &cache->space_info);
5871         BUG_ON(ret);
5872         down_write(&cache->space_info->groups_sem);
5873         list_add_tail(&cache->list, &cache->space_info->block_groups);
5874         up_write(&cache->space_info->groups_sem);
5875
5876         ret = btrfs_add_block_group_cache(root->fs_info, cache);
5877         BUG_ON(ret);
5878
5879         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5880                                 sizeof(cache->item));
5881         BUG_ON(ret);
5882
5883         finish_current_insert(trans, extent_root, 0);
5884         ret = del_pending_extents(trans, extent_root, 0);
5885         BUG_ON(ret);
5886         set_avail_alloc_bits(extent_root->fs_info, type);
5887
5888         return 0;
5889 }
5890
5891 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5892                              struct btrfs_root *root, u64 group_start)
5893 {
5894         struct btrfs_path *path;
5895         struct btrfs_block_group_cache *block_group;
5896         struct btrfs_key key;
5897         int ret;
5898
5899         root = root->fs_info->extent_root;
5900
5901         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5902         BUG_ON(!block_group);
5903         BUG_ON(!block_group->ro);
5904
5905         memcpy(&key, &block_group->key, sizeof(key));
5906
5907         path = btrfs_alloc_path();
5908         BUG_ON(!path);
5909
5910         btrfs_remove_free_space_cache(block_group);
5911         rb_erase(&block_group->cache_node,
5912                  &root->fs_info->block_group_cache_tree);
5913         down_write(&block_group->space_info->groups_sem);
5914         list_del(&block_group->list);
5915         up_write(&block_group->space_info->groups_sem);
5916
5917         spin_lock(&block_group->space_info->lock);
5918         block_group->space_info->total_bytes -= block_group->key.offset;
5919         block_group->space_info->bytes_readonly -= block_group->key.offset;
5920         spin_unlock(&block_group->space_info->lock);
5921         block_group->space_info->full = 0;
5922
5923         /*
5924         memset(shrink_block_group, 0, sizeof(*shrink_block_group));
5925         kfree(shrink_block_group);
5926         */
5927
5928         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5929         if (ret > 0)
5930                 ret = -EIO;
5931         if (ret < 0)
5932                 goto out;
5933
5934         ret = btrfs_del_item(trans, root, path);
5935 out:
5936         btrfs_free_path(path);
5937         return ret;
5938 }