Btrfs: fix cache_block_group error handling
[safe/jmp/linux-2.6] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include "hash.h"
23 #include "crc32c.h"
24 #include "ctree.h"
25 #include "disk-io.h"
26 #include "print-tree.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "ref-cache.h"
31
32 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
33 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
34 #define BLOCK_GROUP_SYSTEM   EXTENT_NEW
35
36 #define BLOCK_GROUP_DIRTY EXTENT_DIRTY
37
38 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
39                                  btrfs_root *extent_root);
40 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
41                                btrfs_root *extent_root);
42 static struct btrfs_block_group_cache *
43 __btrfs_find_block_group(struct btrfs_root *root,
44                          struct btrfs_block_group_cache *hint,
45                          u64 search_start, int data, int owner);
46
47 void maybe_lock_mutex(struct btrfs_root *root)
48 {
49         if (root != root->fs_info->extent_root &&
50             root != root->fs_info->chunk_root &&
51             root != root->fs_info->dev_root) {
52                 mutex_lock(&root->fs_info->alloc_mutex);
53         }
54 }
55
56 void maybe_unlock_mutex(struct btrfs_root *root)
57 {
58         if (root != root->fs_info->extent_root &&
59             root != root->fs_info->chunk_root &&
60             root != root->fs_info->dev_root) {
61                 mutex_unlock(&root->fs_info->alloc_mutex);
62         }
63 }
64
65 static int cache_block_group(struct btrfs_root *root,
66                              struct btrfs_block_group_cache *block_group)
67 {
68         struct btrfs_path *path;
69         int ret = 0;
70         struct btrfs_key key;
71         struct extent_buffer *leaf;
72         struct extent_io_tree *free_space_cache;
73         int slot;
74         u64 last = 0;
75         u64 hole_size;
76         u64 first_free;
77         int found = 0;
78
79         if (!block_group)
80                 return 0;
81
82         root = root->fs_info->extent_root;
83         free_space_cache = &root->fs_info->free_space_cache;
84
85         if (block_group->cached)
86                 return 0;
87
88         path = btrfs_alloc_path();
89         if (!path)
90                 return -ENOMEM;
91
92         path->reada = 2;
93         /*
94          * we get into deadlocks with paths held by callers of this function.
95          * since the alloc_mutex is protecting things right now, just
96          * skip the locking here
97          */
98         path->skip_locking = 1;
99         first_free = block_group->key.objectid;
100         key.objectid = block_group->key.objectid;
101         key.offset = 0;
102         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
103         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
104         if (ret < 0)
105                 goto err;
106         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
107         if (ret < 0)
108                 goto err;
109         if (ret == 0) {
110                 leaf = path->nodes[0];
111                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
112                 if (key.objectid + key.offset > first_free)
113                         first_free = key.objectid + key.offset;
114         }
115         while(1) {
116                 leaf = path->nodes[0];
117                 slot = path->slots[0];
118                 if (slot >= btrfs_header_nritems(leaf)) {
119                         ret = btrfs_next_leaf(root, path);
120                         if (ret < 0)
121                                 goto err;
122                         if (ret == 0) {
123                                 continue;
124                         } else {
125                                 break;
126                         }
127                 }
128                 btrfs_item_key_to_cpu(leaf, &key, slot);
129                 if (key.objectid < block_group->key.objectid) {
130                         goto next;
131                 }
132                 if (key.objectid >= block_group->key.objectid +
133                     block_group->key.offset) {
134                         break;
135                 }
136
137                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
138                         if (!found) {
139                                 last = first_free;
140                                 found = 1;
141                         }
142                         if (key.objectid > last) {
143                                 hole_size = key.objectid - last;
144                                 set_extent_dirty(free_space_cache, last,
145                                                  last + hole_size - 1,
146                                                  GFP_NOFS);
147                         }
148                         last = key.objectid + key.offset;
149                 }
150 next:
151                 path->slots[0]++;
152         }
153
154         if (!found)
155                 last = first_free;
156         if (block_group->key.objectid +
157             block_group->key.offset > last) {
158                 hole_size = block_group->key.objectid +
159                         block_group->key.offset - last;
160                 set_extent_dirty(free_space_cache, last,
161                                  last + hole_size - 1, GFP_NOFS);
162         }
163         block_group->cached = 1;
164         ret = 0;
165 err:
166         btrfs_free_path(path);
167         return ret;
168 }
169
170 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
171                                                        btrfs_fs_info *info,
172                                                          u64 bytenr)
173 {
174         struct extent_io_tree *block_group_cache;
175         struct btrfs_block_group_cache *block_group = NULL;
176         u64 ptr;
177         u64 start;
178         u64 end;
179         int ret;
180
181         bytenr = max_t(u64, bytenr,
182                        BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
183         block_group_cache = &info->block_group_cache;
184         ret = find_first_extent_bit(block_group_cache,
185                                     bytenr, &start, &end,
186                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
187                                     BLOCK_GROUP_SYSTEM);
188         if (ret) {
189                 return NULL;
190         }
191         ret = get_state_private(block_group_cache, start, &ptr);
192         if (ret)
193                 return NULL;
194
195         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
196         return block_group;
197 }
198
199 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
200                                                          btrfs_fs_info *info,
201                                                          u64 bytenr)
202 {
203         struct extent_io_tree *block_group_cache;
204         struct btrfs_block_group_cache *block_group = NULL;
205         u64 ptr;
206         u64 start;
207         u64 end;
208         int ret;
209
210         bytenr = max_t(u64, bytenr,
211                        BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
212         block_group_cache = &info->block_group_cache;
213         ret = find_first_extent_bit(block_group_cache,
214                                     bytenr, &start, &end,
215                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
216                                     BLOCK_GROUP_SYSTEM);
217         if (ret) {
218                 return NULL;
219         }
220         ret = get_state_private(block_group_cache, start, &ptr);
221         if (ret)
222                 return NULL;
223
224         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
225         if (block_group->key.objectid <= bytenr && bytenr <
226             block_group->key.objectid + block_group->key.offset)
227                 return block_group;
228         return NULL;
229 }
230
231 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
232 {
233         return (cache->flags & bits) == bits;
234 }
235
236 static int noinline find_search_start(struct btrfs_root *root,
237                               struct btrfs_block_group_cache **cache_ret,
238                               u64 *start_ret, u64 num, int data)
239 {
240         int ret;
241         struct btrfs_block_group_cache *cache = *cache_ret;
242         struct extent_io_tree *free_space_cache;
243         struct extent_state *state;
244         u64 last;
245         u64 start = 0;
246         u64 cache_miss = 0;
247         u64 total_fs_bytes;
248         u64 search_start = *start_ret;
249         int wrapped = 0;
250
251         WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
252         total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
253         free_space_cache = &root->fs_info->free_space_cache;
254
255         if (!cache)
256                 goto out;
257
258 again:
259         ret = cache_block_group(root, cache);
260         if (ret) {
261                 goto out;
262         }
263
264         last = max(search_start, cache->key.objectid);
265         if (!block_group_bits(cache, data) || cache->ro)
266                 goto new_group;
267
268         spin_lock_irq(&free_space_cache->lock);
269         state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
270         while(1) {
271                 if (!state) {
272                         if (!cache_miss)
273                                 cache_miss = last;
274                         spin_unlock_irq(&free_space_cache->lock);
275                         goto new_group;
276                 }
277
278                 start = max(last, state->start);
279                 last = state->end + 1;
280                 if (last - start < num) {
281                         do {
282                                 state = extent_state_next(state);
283                         } while(state && !(state->state & EXTENT_DIRTY));
284                         continue;
285                 }
286                 spin_unlock_irq(&free_space_cache->lock);
287                 if (cache->ro) {
288                         goto new_group;
289                 }
290                 if (start + num > cache->key.objectid + cache->key.offset)
291                         goto new_group;
292                 if (!block_group_bits(cache, data)) {
293                         printk("block group bits don't match %Lu %d\n", cache->flags, data);
294                 }
295                 *start_ret = start;
296                 return 0;
297         }
298 out:
299         cache = btrfs_lookup_block_group(root->fs_info, search_start);
300         if (!cache) {
301                 printk("Unable to find block group for %Lu\n", search_start);
302                 WARN_ON(1);
303         }
304         return -ENOSPC;
305
306 new_group:
307         last = cache->key.objectid + cache->key.offset;
308 wrapped:
309         cache = btrfs_lookup_first_block_group(root->fs_info, last);
310         if (!cache || cache->key.objectid >= total_fs_bytes) {
311 no_cache:
312                 if (!wrapped) {
313                         wrapped = 1;
314                         last = search_start;
315                         goto wrapped;
316                 }
317                 goto out;
318         }
319         if (cache_miss && !cache->cached) {
320                 cache_block_group(root, cache);
321                 last = cache_miss;
322                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
323         }
324         cache_miss = 0;
325         cache = btrfs_find_block_group(root, cache, last, data, 0);
326         if (!cache)
327                 goto no_cache;
328         *cache_ret = cache;
329         goto again;
330 }
331
332 static u64 div_factor(u64 num, int factor)
333 {
334         if (factor == 10)
335                 return num;
336         num *= factor;
337         do_div(num, 10);
338         return num;
339 }
340
341 static int block_group_state_bits(u64 flags)
342 {
343         int bits = 0;
344         if (flags & BTRFS_BLOCK_GROUP_DATA)
345                 bits |= BLOCK_GROUP_DATA;
346         if (flags & BTRFS_BLOCK_GROUP_METADATA)
347                 bits |= BLOCK_GROUP_METADATA;
348         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
349                 bits |= BLOCK_GROUP_SYSTEM;
350         return bits;
351 }
352
353 static struct btrfs_block_group_cache *
354 __btrfs_find_block_group(struct btrfs_root *root,
355                          struct btrfs_block_group_cache *hint,
356                          u64 search_start, int data, int owner)
357 {
358         struct btrfs_block_group_cache *cache;
359         struct extent_io_tree *block_group_cache;
360         struct btrfs_block_group_cache *found_group = NULL;
361         struct btrfs_fs_info *info = root->fs_info;
362         u64 used;
363         u64 last = 0;
364         u64 start;
365         u64 end;
366         u64 free_check;
367         u64 ptr;
368         int bit;
369         int ret;
370         int full_search = 0;
371         int factor = 10;
372         int wrapped = 0;
373
374         block_group_cache = &info->block_group_cache;
375
376         if (data & BTRFS_BLOCK_GROUP_METADATA)
377                 factor = 9;
378
379         bit = block_group_state_bits(data);
380
381         if (search_start) {
382                 struct btrfs_block_group_cache *shint;
383                 shint = btrfs_lookup_first_block_group(info, search_start);
384                 if (shint && block_group_bits(shint, data) && !shint->ro) {
385                         spin_lock(&shint->lock);
386                         used = btrfs_block_group_used(&shint->item);
387                         if (used + shint->pinned <
388                             div_factor(shint->key.offset, factor)) {
389                                 spin_unlock(&shint->lock);
390                                 return shint;
391                         }
392                         spin_unlock(&shint->lock);
393                 }
394         }
395         if (hint && !hint->ro && block_group_bits(hint, data)) {
396                 spin_lock(&hint->lock);
397                 used = btrfs_block_group_used(&hint->item);
398                 if (used + hint->pinned <
399                     div_factor(hint->key.offset, factor)) {
400                         spin_unlock(&hint->lock);
401                         return hint;
402                 }
403                 spin_unlock(&hint->lock);
404                 last = hint->key.objectid + hint->key.offset;
405         } else {
406                 if (hint)
407                         last = max(hint->key.objectid, search_start);
408                 else
409                         last = search_start;
410         }
411 again:
412         while(1) {
413                 ret = find_first_extent_bit(block_group_cache, last,
414                                             &start, &end, bit);
415                 if (ret)
416                         break;
417
418                 ret = get_state_private(block_group_cache, start, &ptr);
419                 if (ret) {
420                         last = end + 1;
421                         continue;
422                 }
423
424                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
425                 spin_lock(&cache->lock);
426                 last = cache->key.objectid + cache->key.offset;
427                 used = btrfs_block_group_used(&cache->item);
428
429                 if (!cache->ro && block_group_bits(cache, data)) {
430                         free_check = div_factor(cache->key.offset, factor);
431                         if (used + cache->pinned < free_check) {
432                                 found_group = cache;
433                                 spin_unlock(&cache->lock);
434                                 goto found;
435                         }
436                 }
437                 spin_unlock(&cache->lock);
438                 cond_resched();
439         }
440         if (!wrapped) {
441                 last = search_start;
442                 wrapped = 1;
443                 goto again;
444         }
445         if (!full_search && factor < 10) {
446                 last = search_start;
447                 full_search = 1;
448                 factor = 10;
449                 goto again;
450         }
451 found:
452         return found_group;
453 }
454
455 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
456                                                  struct btrfs_block_group_cache
457                                                  *hint, u64 search_start,
458                                                  int data, int owner)
459 {
460
461         struct btrfs_block_group_cache *ret;
462         ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
463         return ret;
464 }
465 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
466                            u64 owner, u64 owner_offset)
467 {
468         u32 high_crc = ~(u32)0;
469         u32 low_crc = ~(u32)0;
470         __le64 lenum;
471         lenum = cpu_to_le64(root_objectid);
472         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
473         lenum = cpu_to_le64(ref_generation);
474         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
475         if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
476                 lenum = cpu_to_le64(owner);
477                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
478                 lenum = cpu_to_le64(owner_offset);
479                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
480         }
481         return ((u64)high_crc << 32) | (u64)low_crc;
482 }
483
484 static int match_extent_ref(struct extent_buffer *leaf,
485                             struct btrfs_extent_ref *disk_ref,
486                             struct btrfs_extent_ref *cpu_ref)
487 {
488         int ret;
489         int len;
490
491         if (cpu_ref->objectid)
492                 len = sizeof(*cpu_ref);
493         else
494                 len = 2 * sizeof(u64);
495         ret = memcmp_extent_buffer(leaf, cpu_ref, (unsigned long)disk_ref,
496                                    len);
497         return ret == 0;
498 }
499
500 /* simple helper to search for an existing extent at a given offset */
501 int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
502                         u64 start, u64 len)
503 {
504         int ret;
505         struct btrfs_key key;
506
507         maybe_lock_mutex(root);
508         key.objectid = start;
509         key.offset = len;
510         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
511         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
512                                 0, 0);
513         maybe_unlock_mutex(root);
514         return ret;
515 }
516
517 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
518                                           struct btrfs_root *root,
519                                           struct btrfs_path *path, u64 bytenr,
520                                           u64 root_objectid,
521                                           u64 ref_generation, u64 owner,
522                                           u64 owner_offset, int del)
523 {
524         u64 hash;
525         struct btrfs_key key;
526         struct btrfs_key found_key;
527         struct btrfs_extent_ref ref;
528         struct extent_buffer *leaf;
529         struct btrfs_extent_ref *disk_ref;
530         int ret;
531         int ret2;
532
533         btrfs_set_stack_ref_root(&ref, root_objectid);
534         btrfs_set_stack_ref_generation(&ref, ref_generation);
535         btrfs_set_stack_ref_objectid(&ref, owner);
536         btrfs_set_stack_ref_offset(&ref, owner_offset);
537
538         hash = hash_extent_ref(root_objectid, ref_generation, owner,
539                                owner_offset);
540         key.offset = hash;
541         key.objectid = bytenr;
542         key.type = BTRFS_EXTENT_REF_KEY;
543
544         while (1) {
545                 ret = btrfs_search_slot(trans, root, &key, path,
546                                         del ? -1 : 0, del);
547                 if (ret < 0)
548                         goto out;
549                 leaf = path->nodes[0];
550                 if (ret != 0) {
551                         u32 nritems = btrfs_header_nritems(leaf);
552                         if (path->slots[0] >= nritems) {
553                                 ret2 = btrfs_next_leaf(root, path);
554                                 if (ret2)
555                                         goto out;
556                                 leaf = path->nodes[0];
557                         }
558                         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
559                         if (found_key.objectid != bytenr ||
560                             found_key.type != BTRFS_EXTENT_REF_KEY)
561                                 goto out;
562                         key.offset = found_key.offset;
563                         if (del) {
564                                 btrfs_release_path(root, path);
565                                 continue;
566                         }
567                 }
568                 disk_ref = btrfs_item_ptr(path->nodes[0],
569                                           path->slots[0],
570                                           struct btrfs_extent_ref);
571                 if (match_extent_ref(path->nodes[0], disk_ref, &ref)) {
572                         ret = 0;
573                         goto out;
574                 }
575                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
576                 key.offset = found_key.offset + 1;
577                 btrfs_release_path(root, path);
578         }
579 out:
580         return ret;
581 }
582
583 /*
584  * Back reference rules.  Back refs have three main goals:
585  *
586  * 1) differentiate between all holders of references to an extent so that
587  *    when a reference is dropped we can make sure it was a valid reference
588  *    before freeing the extent.
589  *
590  * 2) Provide enough information to quickly find the holders of an extent
591  *    if we notice a given block is corrupted or bad.
592  *
593  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
594  *    maintenance.  This is actually the same as #2, but with a slightly
595  *    different use case.
596  *
597  * File extents can be referenced by:
598  *
599  * - multiple snapshots, subvolumes, or different generations in one subvol
600  * - different files inside a single subvolume (in theory, not implemented yet)
601  * - different offsets inside a file (bookend extents in file.c)
602  *
603  * The extent ref structure has fields for:
604  *
605  * - Objectid of the subvolume root
606  * - Generation number of the tree holding the reference
607  * - objectid of the file holding the reference
608  * - offset in the file corresponding to the key holding the reference
609  *
610  * When a file extent is allocated the fields are filled in:
611  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
612  *
613  * When a leaf is cow'd new references are added for every file extent found
614  * in the leaf.  It looks the same as the create case, but trans->transid
615  * will be different when the block is cow'd.
616  *
617  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
618  *
619  * When a file extent is removed either during snapshot deletion or file
620  * truncation, the corresponding back reference is found
621  * by searching for:
622  *
623  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
624  *      inode objectid, offset in file)
625  *
626  * Btree extents can be referenced by:
627  *
628  * - Different subvolumes
629  * - Different generations of the same subvolume
630  *
631  * Storing sufficient information for a full reverse mapping of a btree
632  * block would require storing the lowest key of the block in the backref,
633  * and it would require updating that lowest key either before write out or
634  * every time it changed.  Instead, the objectid of the lowest key is stored
635  * along with the level of the tree block.  This provides a hint
636  * about where in the btree the block can be found.  Searches through the
637  * btree only need to look for a pointer to that block, so they stop one
638  * level higher than the level recorded in the backref.
639  *
640  * Some btrees do not do reference counting on their extents.  These
641  * include the extent tree and the tree of tree roots.  Backrefs for these
642  * trees always have a generation of zero.
643  *
644  * When a tree block is created, back references are inserted:
645  *
646  * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
647  *
648  * When a tree block is cow'd in a reference counted root,
649  * new back references are added for all the blocks it points to.
650  * These are of the form (trans->transid will have increased since creation):
651  *
652  * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
653  *
654  * Because the lowest_key_objectid and the level are just hints
655  * they are not used when backrefs are deleted.  When a backref is deleted:
656  *
657  * if backref was for a tree root:
658  *     root_objectid = root->root_key.objectid
659  * else
660  *     root_objectid = btrfs_header_owner(parent)
661  *
662  * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
663  *
664  * Back Reference Key hashing:
665  *
666  * Back references have four fields, each 64 bits long.  Unfortunately,
667  * This is hashed into a single 64 bit number and placed into the key offset.
668  * The key objectid corresponds to the first byte in the extent, and the
669  * key type is set to BTRFS_EXTENT_REF_KEY
670  */
671 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
672                                  struct btrfs_root *root,
673                                  struct btrfs_path *path, u64 bytenr,
674                                  u64 root_objectid, u64 ref_generation,
675                                  u64 owner, u64 owner_offset)
676 {
677         u64 hash;
678         struct btrfs_key key;
679         struct btrfs_extent_ref ref;
680         struct btrfs_extent_ref *disk_ref;
681         int ret;
682
683         btrfs_set_stack_ref_root(&ref, root_objectid);
684         btrfs_set_stack_ref_generation(&ref, ref_generation);
685         btrfs_set_stack_ref_objectid(&ref, owner);
686         btrfs_set_stack_ref_offset(&ref, owner_offset);
687
688         hash = hash_extent_ref(root_objectid, ref_generation, owner,
689                                owner_offset);
690         key.offset = hash;
691         key.objectid = bytenr;
692         key.type = BTRFS_EXTENT_REF_KEY;
693
694         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
695         while (ret == -EEXIST) {
696                 disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
697                                           struct btrfs_extent_ref);
698                 if (match_extent_ref(path->nodes[0], disk_ref, &ref))
699                         goto out;
700                 key.offset++;
701                 btrfs_release_path(root, path);
702                 ret = btrfs_insert_empty_item(trans, root, path, &key,
703                                               sizeof(ref));
704         }
705         if (ret)
706                 goto out;
707         disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
708                                   struct btrfs_extent_ref);
709         write_extent_buffer(path->nodes[0], &ref, (unsigned long)disk_ref,
710                             sizeof(ref));
711         btrfs_mark_buffer_dirty(path->nodes[0]);
712 out:
713         btrfs_release_path(root, path);
714         return ret;
715 }
716
717 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
718                                 struct btrfs_root *root,
719                                 u64 bytenr, u64 num_bytes,
720                                 u64 root_objectid, u64 ref_generation,
721                                 u64 owner, u64 owner_offset)
722 {
723         struct btrfs_path *path;
724         int ret;
725         struct btrfs_key key;
726         struct extent_buffer *l;
727         struct btrfs_extent_item *item;
728         u32 refs;
729
730         WARN_ON(num_bytes < root->sectorsize);
731         path = btrfs_alloc_path();
732         if (!path)
733                 return -ENOMEM;
734
735         path->reada = 1;
736         key.objectid = bytenr;
737         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
738         key.offset = num_bytes;
739         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
740                                 0, 1);
741         if (ret < 0)
742                 return ret;
743         if (ret != 0) {
744                 BUG();
745         }
746         BUG_ON(ret != 0);
747         l = path->nodes[0];
748         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
749         refs = btrfs_extent_refs(l, item);
750         btrfs_set_extent_refs(l, item, refs + 1);
751         btrfs_mark_buffer_dirty(path->nodes[0]);
752
753         btrfs_release_path(root->fs_info->extent_root, path);
754
755         path->reada = 1;
756         ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
757                                           path, bytenr, root_objectid,
758                                           ref_generation, owner, owner_offset);
759         BUG_ON(ret);
760         finish_current_insert(trans, root->fs_info->extent_root);
761         del_pending_extents(trans, root->fs_info->extent_root);
762
763         btrfs_free_path(path);
764         return 0;
765 }
766
767 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
768                                 struct btrfs_root *root,
769                                 u64 bytenr, u64 num_bytes,
770                                 u64 root_objectid, u64 ref_generation,
771                                 u64 owner, u64 owner_offset)
772 {
773         int ret;
774
775         mutex_lock(&root->fs_info->alloc_mutex);
776         ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
777                                      root_objectid, ref_generation,
778                                      owner, owner_offset);
779         mutex_unlock(&root->fs_info->alloc_mutex);
780         return ret;
781 }
782
783 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
784                          struct btrfs_root *root)
785 {
786         finish_current_insert(trans, root->fs_info->extent_root);
787         del_pending_extents(trans, root->fs_info->extent_root);
788         return 0;
789 }
790
791 static int lookup_extent_ref(struct btrfs_trans_handle *trans,
792                              struct btrfs_root *root, u64 bytenr,
793                              u64 num_bytes, u32 *refs)
794 {
795         struct btrfs_path *path;
796         int ret;
797         struct btrfs_key key;
798         struct extent_buffer *l;
799         struct btrfs_extent_item *item;
800
801         WARN_ON(num_bytes < root->sectorsize);
802         path = btrfs_alloc_path();
803         path->reada = 1;
804         key.objectid = bytenr;
805         key.offset = num_bytes;
806         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
807         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
808                                 0, 0);
809         if (ret < 0)
810                 goto out;
811         if (ret != 0) {
812                 btrfs_print_leaf(root, path->nodes[0]);
813                 printk("failed to find block number %Lu\n", bytenr);
814                 BUG();
815         }
816         l = path->nodes[0];
817         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
818         *refs = btrfs_extent_refs(l, item);
819 out:
820         btrfs_free_path(path);
821         return 0;
822 }
823
824
825 static int get_reference_status(struct btrfs_root *root, u64 bytenr,
826                                 u64 parent_gen, u64 ref_objectid,
827                                 u64 *min_generation, u32 *ref_count)
828 {
829         struct btrfs_root *extent_root = root->fs_info->extent_root;
830         struct btrfs_path *path;
831         struct extent_buffer *leaf;
832         struct btrfs_extent_ref *ref_item;
833         struct btrfs_key key;
834         struct btrfs_key found_key;
835         u64 root_objectid = root->root_key.objectid;
836         u64 ref_generation;
837         u32 nritems;
838         int ret;
839
840         key.objectid = bytenr;
841         key.offset = 0;
842         key.type = BTRFS_EXTENT_ITEM_KEY;
843
844         path = btrfs_alloc_path();
845         mutex_lock(&root->fs_info->alloc_mutex);
846         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
847         if (ret < 0)
848                 goto out;
849         BUG_ON(ret == 0);
850
851         leaf = path->nodes[0];
852         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
853
854         if (found_key.objectid != bytenr ||
855             found_key.type != BTRFS_EXTENT_ITEM_KEY) {
856                 ret = 1;
857                 goto out;
858         }
859
860         *ref_count = 0;
861         *min_generation = (u64)-1;
862
863         while (1) {
864                 leaf = path->nodes[0];
865                 nritems = btrfs_header_nritems(leaf);
866                 if (path->slots[0] >= nritems) {
867                         ret = btrfs_next_leaf(extent_root, path);
868                         if (ret < 0)
869                                 goto out;
870                         if (ret == 0)
871                                 continue;
872                         break;
873                 }
874                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
875                 if (found_key.objectid != bytenr)
876                         break;
877
878                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
879                         path->slots[0]++;
880                         continue;
881                 }
882
883                 ref_item = btrfs_item_ptr(leaf, path->slots[0],
884                                           struct btrfs_extent_ref);
885                 ref_generation = btrfs_ref_generation(leaf, ref_item);
886                 /*
887                  * For (parent_gen > 0 && parent_gen > ref_gen):
888                  *
889                  * we reach here through the oldest root, therefore
890                  * all other reference from same snapshot should have
891                  * a larger generation.
892                  */
893                 if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
894                     (parent_gen > 0 && parent_gen > ref_generation) ||
895                     (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
896                      ref_objectid != btrfs_ref_objectid(leaf, ref_item))) {
897                         if (ref_count)
898                                 *ref_count = 2;
899                         break;
900                 }
901
902                 *ref_count = 1;
903                 if (*min_generation > ref_generation)
904                         *min_generation = ref_generation;
905
906                 path->slots[0]++;
907         }
908         ret = 0;
909 out:
910         mutex_unlock(&root->fs_info->alloc_mutex);
911         btrfs_free_path(path);
912         return ret;
913 }
914
915 int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
916                            struct btrfs_root *root,
917                            struct btrfs_key *key, u64 bytenr)
918 {
919         struct btrfs_root *old_root;
920         struct btrfs_path *path = NULL;
921         struct extent_buffer *eb;
922         struct btrfs_file_extent_item *item;
923         u64 ref_generation;
924         u64 min_generation;
925         u64 extent_start;
926         u32 ref_count;
927         int level;
928         int ret;
929
930         BUG_ON(trans == NULL);
931         BUG_ON(key->type != BTRFS_EXTENT_DATA_KEY);
932         ret = get_reference_status(root, bytenr, 0, key->objectid,
933                                    &min_generation, &ref_count);
934         if (ret)
935                 return ret;
936
937         if (ref_count != 1)
938                 return 1;
939
940         old_root = root->dirty_root->root;
941         ref_generation = old_root->root_key.offset;
942
943         /* all references are created in running transaction */
944         if (min_generation > ref_generation) {
945                 ret = 0;
946                 goto out;
947         }
948
949         path = btrfs_alloc_path();
950         if (!path) {
951                 ret = -ENOMEM;
952                 goto out;
953         }
954
955         path->skip_locking = 1;
956         /* if no item found, the extent is referenced by other snapshot */
957         ret = btrfs_search_slot(NULL, old_root, key, path, 0, 0);
958         if (ret)
959                 goto out;
960
961         eb = path->nodes[0];
962         item = btrfs_item_ptr(eb, path->slots[0],
963                               struct btrfs_file_extent_item);
964         if (btrfs_file_extent_type(eb, item) != BTRFS_FILE_EXTENT_REG ||
965             btrfs_file_extent_disk_bytenr(eb, item) != bytenr) {
966                 ret = 1;
967                 goto out;
968         }
969
970         for (level = BTRFS_MAX_LEVEL - 1; level >= -1; level--) {
971                 if (level >= 0) {
972                         eb = path->nodes[level];
973                         if (!eb)
974                                 continue;
975                         extent_start = eb->start;
976                 } else
977                         extent_start = bytenr;
978
979                 ret = get_reference_status(root, extent_start, ref_generation,
980                                            0, &min_generation, &ref_count);
981                 if (ret)
982                         goto out;
983
984                 if (ref_count != 1) {
985                         ret = 1;
986                         goto out;
987                 }
988                 if (level >= 0)
989                         ref_generation = btrfs_header_generation(eb);
990         }
991         ret = 0;
992 out:
993         if (path)
994                 btrfs_free_path(path);
995         return ret;
996 }
997
998 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
999                   struct extent_buffer *buf, int cache_ref)
1000 {
1001         u64 bytenr;
1002         u32 nritems;
1003         struct btrfs_key key;
1004         struct btrfs_file_extent_item *fi;
1005         int i;
1006         int level;
1007         int ret;
1008         int faili;
1009         int nr_file_extents = 0;
1010
1011         if (!root->ref_cows)
1012                 return 0;
1013
1014         level = btrfs_header_level(buf);
1015         nritems = btrfs_header_nritems(buf);
1016         for (i = 0; i < nritems; i++) {
1017                 cond_resched();
1018                 if (level == 0) {
1019                         u64 disk_bytenr;
1020                         btrfs_item_key_to_cpu(buf, &key, i);
1021                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1022                                 continue;
1023                         fi = btrfs_item_ptr(buf, i,
1024                                             struct btrfs_file_extent_item);
1025                         if (btrfs_file_extent_type(buf, fi) ==
1026                             BTRFS_FILE_EXTENT_INLINE)
1027                                 continue;
1028                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1029                         if (disk_bytenr == 0)
1030                                 continue;
1031
1032                         if (buf != root->commit_root)
1033                                 nr_file_extents++;
1034
1035                         mutex_lock(&root->fs_info->alloc_mutex);
1036                         ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
1037                                     btrfs_file_extent_disk_num_bytes(buf, fi),
1038                                     root->root_key.objectid, trans->transid,
1039                                     key.objectid, key.offset);
1040                         mutex_unlock(&root->fs_info->alloc_mutex);
1041                         if (ret) {
1042                                 faili = i;
1043                                 WARN_ON(1);
1044                                 goto fail;
1045                         }
1046                 } else {
1047                         bytenr = btrfs_node_blockptr(buf, i);
1048                         btrfs_node_key_to_cpu(buf, &key, i);
1049
1050                         mutex_lock(&root->fs_info->alloc_mutex);
1051                         ret = __btrfs_inc_extent_ref(trans, root, bytenr,
1052                                            btrfs_level_size(root, level - 1),
1053                                            root->root_key.objectid,
1054                                            trans->transid,
1055                                            level - 1, key.objectid);
1056                         mutex_unlock(&root->fs_info->alloc_mutex);
1057                         if (ret) {
1058                                 faili = i;
1059                                 WARN_ON(1);
1060                                 goto fail;
1061                         }
1062                 }
1063         }
1064         /* cache orignal leaf block's references */
1065         if (level == 0 && cache_ref && buf != root->commit_root) {
1066                 struct btrfs_leaf_ref *ref;
1067                 struct btrfs_extent_info *info;
1068
1069                 ref = btrfs_alloc_leaf_ref(root, nr_file_extents);
1070                 if (!ref) {
1071                         WARN_ON(1);
1072                         goto out;
1073                 }
1074
1075                 ref->root_gen = root->root_key.offset;
1076                 ref->bytenr = buf->start;
1077                 ref->owner = btrfs_header_owner(buf);
1078                 ref->generation = btrfs_header_generation(buf);
1079                 ref->nritems = nr_file_extents;
1080                 info = ref->extents;
1081
1082                 for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
1083                         u64 disk_bytenr;
1084                         btrfs_item_key_to_cpu(buf, &key, i);
1085                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1086                                 continue;
1087                         fi = btrfs_item_ptr(buf, i,
1088                                             struct btrfs_file_extent_item);
1089                         if (btrfs_file_extent_type(buf, fi) ==
1090                             BTRFS_FILE_EXTENT_INLINE)
1091                                 continue;
1092                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1093                         if (disk_bytenr == 0)
1094                                 continue;
1095
1096                         info->bytenr = disk_bytenr;
1097                         info->num_bytes =
1098                                 btrfs_file_extent_disk_num_bytes(buf, fi);
1099                         info->objectid = key.objectid;
1100                         info->offset = key.offset;
1101                         info++;
1102                 }
1103
1104                 BUG_ON(!root->ref_tree);
1105                 ret = btrfs_add_leaf_ref(root, ref);
1106                 WARN_ON(ret);
1107                 btrfs_free_leaf_ref(root, ref);
1108         }
1109 out:
1110         return 0;
1111 fail:
1112         WARN_ON(1);
1113 #if 0
1114         for (i =0; i < faili; i++) {
1115                 if (level == 0) {
1116                         u64 disk_bytenr;
1117                         btrfs_item_key_to_cpu(buf, &key, i);
1118                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1119                                 continue;
1120                         fi = btrfs_item_ptr(buf, i,
1121                                             struct btrfs_file_extent_item);
1122                         if (btrfs_file_extent_type(buf, fi) ==
1123                             BTRFS_FILE_EXTENT_INLINE)
1124                                 continue;
1125                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1126                         if (disk_bytenr == 0)
1127                                 continue;
1128                         err = btrfs_free_extent(trans, root, disk_bytenr,
1129                                     btrfs_file_extent_disk_num_bytes(buf,
1130                                                                       fi), 0);
1131                         BUG_ON(err);
1132                 } else {
1133                         bytenr = btrfs_node_blockptr(buf, i);
1134                         err = btrfs_free_extent(trans, root, bytenr,
1135                                         btrfs_level_size(root, level - 1), 0);
1136                         BUG_ON(err);
1137                 }
1138         }
1139 #endif
1140         return ret;
1141 }
1142
1143 static int write_one_cache_group(struct btrfs_trans_handle *trans,
1144                                  struct btrfs_root *root,
1145                                  struct btrfs_path *path,
1146                                  struct btrfs_block_group_cache *cache)
1147 {
1148         int ret;
1149         int pending_ret;
1150         struct btrfs_root *extent_root = root->fs_info->extent_root;
1151         unsigned long bi;
1152         struct extent_buffer *leaf;
1153
1154         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1155         if (ret < 0)
1156                 goto fail;
1157         BUG_ON(ret);
1158
1159         leaf = path->nodes[0];
1160         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1161         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1162         btrfs_mark_buffer_dirty(leaf);
1163         btrfs_release_path(extent_root, path);
1164 fail:
1165         finish_current_insert(trans, extent_root);
1166         pending_ret = del_pending_extents(trans, extent_root);
1167         if (ret)
1168                 return ret;
1169         if (pending_ret)
1170                 return pending_ret;
1171         return 0;
1172
1173 }
1174
1175 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1176                                    struct btrfs_root *root)
1177 {
1178         struct extent_io_tree *block_group_cache;
1179         struct btrfs_block_group_cache *cache;
1180         int ret;
1181         int err = 0;
1182         int werr = 0;
1183         struct btrfs_path *path;
1184         u64 last = 0;
1185         u64 start;
1186         u64 end;
1187         u64 ptr;
1188
1189         block_group_cache = &root->fs_info->block_group_cache;
1190         path = btrfs_alloc_path();
1191         if (!path)
1192                 return -ENOMEM;
1193
1194         mutex_lock(&root->fs_info->alloc_mutex);
1195         while(1) {
1196                 ret = find_first_extent_bit(block_group_cache, last,
1197                                             &start, &end, BLOCK_GROUP_DIRTY);
1198                 if (ret)
1199                         break;
1200
1201                 last = end + 1;
1202                 ret = get_state_private(block_group_cache, start, &ptr);
1203                 if (ret)
1204                         break;
1205                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
1206                 err = write_one_cache_group(trans, root,
1207                                             path, cache);
1208                 /*
1209                  * if we fail to write the cache group, we want
1210                  * to keep it marked dirty in hopes that a later
1211                  * write will work
1212                  */
1213                 if (err) {
1214                         werr = err;
1215                         continue;
1216                 }
1217                 clear_extent_bits(block_group_cache, start, end,
1218                                   BLOCK_GROUP_DIRTY, GFP_NOFS);
1219         }
1220         btrfs_free_path(path);
1221         mutex_unlock(&root->fs_info->alloc_mutex);
1222         return werr;
1223 }
1224
1225 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
1226                                                   u64 flags)
1227 {
1228         struct list_head *head = &info->space_info;
1229         struct list_head *cur;
1230         struct btrfs_space_info *found;
1231         list_for_each(cur, head) {
1232                 found = list_entry(cur, struct btrfs_space_info, list);
1233                 if (found->flags == flags)
1234                         return found;
1235         }
1236         return NULL;
1237
1238 }
1239
1240 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1241                              u64 total_bytes, u64 bytes_used,
1242                              struct btrfs_space_info **space_info)
1243 {
1244         struct btrfs_space_info *found;
1245
1246         found = __find_space_info(info, flags);
1247         if (found) {
1248                 found->total_bytes += total_bytes;
1249                 found->bytes_used += bytes_used;
1250                 found->full = 0;
1251                 *space_info = found;
1252                 return 0;
1253         }
1254         found = kmalloc(sizeof(*found), GFP_NOFS);
1255         if (!found)
1256                 return -ENOMEM;
1257
1258         list_add(&found->list, &info->space_info);
1259         found->flags = flags;
1260         found->total_bytes = total_bytes;
1261         found->bytes_used = bytes_used;
1262         found->bytes_pinned = 0;
1263         found->full = 0;
1264         found->force_alloc = 0;
1265         *space_info = found;
1266         return 0;
1267 }
1268
1269 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1270 {
1271         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1272                                    BTRFS_BLOCK_GROUP_RAID1 |
1273                                    BTRFS_BLOCK_GROUP_RAID10 |
1274                                    BTRFS_BLOCK_GROUP_DUP);
1275         if (extra_flags) {
1276                 if (flags & BTRFS_BLOCK_GROUP_DATA)
1277                         fs_info->avail_data_alloc_bits |= extra_flags;
1278                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1279                         fs_info->avail_metadata_alloc_bits |= extra_flags;
1280                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1281                         fs_info->avail_system_alloc_bits |= extra_flags;
1282         }
1283 }
1284
1285 static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1286 {
1287         u64 num_devices = root->fs_info->fs_devices->num_devices;
1288
1289         if (num_devices == 1)
1290                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1291         if (num_devices < 4)
1292                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1293
1294         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1295             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1296                       BTRFS_BLOCK_GROUP_RAID10))) {
1297                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1298         }
1299
1300         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1301             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1302                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1303         }
1304
1305         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1306             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1307              (flags & BTRFS_BLOCK_GROUP_RAID10) |
1308              (flags & BTRFS_BLOCK_GROUP_DUP)))
1309                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1310         return flags;
1311 }
1312
1313 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1314                           struct btrfs_root *extent_root, u64 alloc_bytes,
1315                           u64 flags, int force)
1316 {
1317         struct btrfs_space_info *space_info;
1318         u64 thresh;
1319         u64 start;
1320         u64 num_bytes;
1321         int ret;
1322
1323         flags = reduce_alloc_profile(extent_root, flags);
1324
1325         space_info = __find_space_info(extent_root->fs_info, flags);
1326         if (!space_info) {
1327                 ret = update_space_info(extent_root->fs_info, flags,
1328                                         0, 0, &space_info);
1329                 BUG_ON(ret);
1330         }
1331         BUG_ON(!space_info);
1332
1333         if (space_info->force_alloc) {
1334                 force = 1;
1335                 space_info->force_alloc = 0;
1336         }
1337         if (space_info->full)
1338                 goto out;
1339
1340         thresh = div_factor(space_info->total_bytes, 6);
1341         if (!force &&
1342            (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
1343             thresh)
1344                 goto out;
1345
1346         mutex_lock(&extent_root->fs_info->chunk_mutex);
1347         ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
1348         if (ret == -ENOSPC) {
1349 printk("space info full %Lu\n", flags);
1350                 space_info->full = 1;
1351                 goto out_unlock;
1352         }
1353         BUG_ON(ret);
1354
1355         ret = btrfs_make_block_group(trans, extent_root, 0, flags,
1356                      BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
1357         BUG_ON(ret);
1358 out_unlock:
1359         mutex_unlock(&extent_root->fs_info->chunk_mutex);
1360 out:
1361         return 0;
1362 }
1363
1364 static int update_block_group(struct btrfs_trans_handle *trans,
1365                               struct btrfs_root *root,
1366                               u64 bytenr, u64 num_bytes, int alloc,
1367                               int mark_free)
1368 {
1369         struct btrfs_block_group_cache *cache;
1370         struct btrfs_fs_info *info = root->fs_info;
1371         u64 total = num_bytes;
1372         u64 old_val;
1373         u64 byte_in_group;
1374         u64 start;
1375         u64 end;
1376
1377         WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1378         while(total) {
1379                 cache = btrfs_lookup_block_group(info, bytenr);
1380                 if (!cache) {
1381                         return -1;
1382                 }
1383                 byte_in_group = bytenr - cache->key.objectid;
1384                 WARN_ON(byte_in_group > cache->key.offset);
1385                 start = cache->key.objectid;
1386                 end = start + cache->key.offset - 1;
1387                 set_extent_bits(&info->block_group_cache, start, end,
1388                                 BLOCK_GROUP_DIRTY, GFP_NOFS);
1389
1390                 spin_lock(&cache->lock);
1391                 old_val = btrfs_block_group_used(&cache->item);
1392                 num_bytes = min(total, cache->key.offset - byte_in_group);
1393                 if (alloc) {
1394                         old_val += num_bytes;
1395                         cache->space_info->bytes_used += num_bytes;
1396                         btrfs_set_block_group_used(&cache->item, old_val);
1397                         spin_unlock(&cache->lock);
1398                 } else {
1399                         old_val -= num_bytes;
1400                         cache->space_info->bytes_used -= num_bytes;
1401                         btrfs_set_block_group_used(&cache->item, old_val);
1402                         spin_unlock(&cache->lock);
1403                         if (mark_free) {
1404                                 set_extent_dirty(&info->free_space_cache,
1405                                                  bytenr, bytenr + num_bytes - 1,
1406                                                  GFP_NOFS);
1407                         }
1408                 }
1409                 total -= num_bytes;
1410                 bytenr += num_bytes;
1411         }
1412         return 0;
1413 }
1414
1415 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1416 {
1417         u64 start;
1418         u64 end;
1419         int ret;
1420         ret = find_first_extent_bit(&root->fs_info->block_group_cache,
1421                                     search_start, &start, &end,
1422                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
1423                                     BLOCK_GROUP_SYSTEM);
1424         if (ret)
1425                 return 0;
1426         return start;
1427 }
1428
1429
1430 int btrfs_update_pinned_extents(struct btrfs_root *root,
1431                                 u64 bytenr, u64 num, int pin)
1432 {
1433         u64 len;
1434         struct btrfs_block_group_cache *cache;
1435         struct btrfs_fs_info *fs_info = root->fs_info;
1436
1437         WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1438         if (pin) {
1439                 set_extent_dirty(&fs_info->pinned_extents,
1440                                 bytenr, bytenr + num - 1, GFP_NOFS);
1441         } else {
1442                 clear_extent_dirty(&fs_info->pinned_extents,
1443                                 bytenr, bytenr + num - 1, GFP_NOFS);
1444         }
1445         while (num > 0) {
1446                 cache = btrfs_lookup_block_group(fs_info, bytenr);
1447                 if (!cache) {
1448                         u64 first = first_logical_byte(root, bytenr);
1449                         WARN_ON(first < bytenr);
1450                         len = min(first - bytenr, num);
1451                 } else {
1452                         len = min(num, cache->key.offset -
1453                                   (bytenr - cache->key.objectid));
1454                 }
1455                 if (pin) {
1456                         if (cache) {
1457                                 spin_lock(&cache->lock);
1458                                 cache->pinned += len;
1459                                 cache->space_info->bytes_pinned += len;
1460                                 spin_unlock(&cache->lock);
1461                         }
1462                         fs_info->total_pinned += len;
1463                 } else {
1464                         if (cache) {
1465                                 spin_lock(&cache->lock);
1466                                 cache->pinned -= len;
1467                                 cache->space_info->bytes_pinned -= len;
1468                                 spin_unlock(&cache->lock);
1469                         }
1470                         fs_info->total_pinned -= len;
1471                 }
1472                 bytenr += len;
1473                 num -= len;
1474         }
1475         return 0;
1476 }
1477
1478 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
1479 {
1480         u64 last = 0;
1481         u64 start;
1482         u64 end;
1483         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
1484         int ret;
1485
1486         while(1) {
1487                 ret = find_first_extent_bit(pinned_extents, last,
1488                                             &start, &end, EXTENT_DIRTY);
1489                 if (ret)
1490                         break;
1491                 set_extent_dirty(copy, start, end, GFP_NOFS);
1492                 last = end + 1;
1493         }
1494         return 0;
1495 }
1496
1497 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1498                                struct btrfs_root *root,
1499                                struct extent_io_tree *unpin)
1500 {
1501         u64 start;
1502         u64 end;
1503         int ret;
1504         struct extent_io_tree *free_space_cache;
1505         free_space_cache = &root->fs_info->free_space_cache;
1506
1507         mutex_lock(&root->fs_info->alloc_mutex);
1508         while(1) {
1509                 ret = find_first_extent_bit(unpin, 0, &start, &end,
1510                                             EXTENT_DIRTY);
1511                 if (ret)
1512                         break;
1513                 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
1514                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
1515                 set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
1516                 if (need_resched()) {
1517                         mutex_unlock(&root->fs_info->alloc_mutex);
1518                         cond_resched();
1519                         mutex_lock(&root->fs_info->alloc_mutex);
1520                 }
1521         }
1522         mutex_unlock(&root->fs_info->alloc_mutex);
1523         return 0;
1524 }
1525
1526 static int finish_current_insert(struct btrfs_trans_handle *trans,
1527                                  struct btrfs_root *extent_root)
1528 {
1529         u64 start;
1530         u64 end;
1531         struct btrfs_fs_info *info = extent_root->fs_info;
1532         struct extent_buffer *eb;
1533         struct btrfs_path *path;
1534         struct btrfs_key ins;
1535         struct btrfs_disk_key first;
1536         struct btrfs_extent_item extent_item;
1537         int ret;
1538         int level;
1539         int err = 0;
1540
1541         WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
1542         btrfs_set_stack_extent_refs(&extent_item, 1);
1543         btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
1544         path = btrfs_alloc_path();
1545
1546         while(1) {
1547                 ret = find_first_extent_bit(&info->extent_ins, 0, &start,
1548                                             &end, EXTENT_LOCKED);
1549                 if (ret)
1550                         break;
1551
1552                 ins.objectid = start;
1553                 ins.offset = end + 1 - start;
1554                 err = btrfs_insert_item(trans, extent_root, &ins,
1555                                         &extent_item, sizeof(extent_item));
1556                 clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
1557                                   GFP_NOFS);
1558
1559                 eb = btrfs_find_create_tree_block(extent_root, ins.objectid,
1560                                            ins.offset);
1561
1562                 if (!btrfs_buffer_uptodate(eb, trans->transid))
1563                         btrfs_read_buffer(eb, trans->transid);
1564
1565                 btrfs_tree_lock(eb);
1566                 level = btrfs_header_level(eb);
1567                 if (level == 0) {
1568                         btrfs_item_key(eb, &first, 0);
1569                 } else {
1570                         btrfs_node_key(eb, &first, 0);
1571                 }
1572                 btrfs_tree_unlock(eb);
1573                 free_extent_buffer(eb);
1574                 /*
1575                  * the first key is just a hint, so the race we've created
1576                  * against reading it is fine
1577                  */
1578                 err = btrfs_insert_extent_backref(trans, extent_root, path,
1579                                           start, extent_root->root_key.objectid,
1580                                           0, level,
1581                                           btrfs_disk_key_objectid(&first));
1582                 BUG_ON(err);
1583                 if (need_resched()) {
1584                         mutex_unlock(&extent_root->fs_info->alloc_mutex);
1585                         cond_resched();
1586                         mutex_lock(&extent_root->fs_info->alloc_mutex);
1587                 }
1588         }
1589         btrfs_free_path(path);
1590         return 0;
1591 }
1592
1593 static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
1594                           int is_data, int pending)
1595 {
1596         int err = 0;
1597
1598         WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1599         if (!pending) {
1600                 struct extent_buffer *buf;
1601
1602                 if (is_data)
1603                         goto pinit;
1604
1605                 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
1606                 if (buf) {
1607                         /* we can reuse a block if it hasn't been written
1608                          * and it is from this transaction.  We can't
1609                          * reuse anything from the tree log root because
1610                          * it has tiny sub-transactions.
1611                          */
1612                         if (btrfs_buffer_uptodate(buf, 0) &&
1613                             btrfs_try_tree_lock(buf)) {
1614                                 u64 transid =
1615                                     root->fs_info->running_transaction->transid;
1616                                 u64 header_transid =
1617                                         btrfs_header_generation(buf);
1618                                 if (btrfs_header_owner(buf) !=
1619                                     BTRFS_TREE_LOG_OBJECTID &&
1620                                     header_transid == transid &&
1621                                     !btrfs_header_flag(buf,
1622                                                BTRFS_HEADER_FLAG_WRITTEN)) {
1623                                         clean_tree_block(NULL, root, buf);
1624                                         btrfs_tree_unlock(buf);
1625                                         free_extent_buffer(buf);
1626                                         return 1;
1627                                 }
1628                                 btrfs_tree_unlock(buf);
1629                         }
1630                         free_extent_buffer(buf);
1631                 }
1632 pinit:
1633                 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
1634         } else {
1635                 set_extent_bits(&root->fs_info->pending_del,
1636                                 bytenr, bytenr + num_bytes - 1,
1637                                 EXTENT_LOCKED, GFP_NOFS);
1638         }
1639         BUG_ON(err < 0);
1640         return 0;
1641 }
1642
1643 /*
1644  * remove an extent from the root, returns 0 on success
1645  */
1646 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1647                          *root, u64 bytenr, u64 num_bytes,
1648                          u64 root_objectid, u64 ref_generation,
1649                          u64 owner_objectid, u64 owner_offset, int pin,
1650                          int mark_free)
1651 {
1652         struct btrfs_path *path;
1653         struct btrfs_key key;
1654         struct btrfs_fs_info *info = root->fs_info;
1655         struct btrfs_root *extent_root = info->extent_root;
1656         struct extent_buffer *leaf;
1657         int ret;
1658         int extent_slot = 0;
1659         int found_extent = 0;
1660         int num_to_del = 1;
1661         struct btrfs_extent_item *ei;
1662         u32 refs;
1663
1664         WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1665         key.objectid = bytenr;
1666         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1667         key.offset = num_bytes;
1668         path = btrfs_alloc_path();
1669         if (!path)
1670                 return -ENOMEM;
1671
1672         path->reada = 1;
1673         ret = lookup_extent_backref(trans, extent_root, path,
1674                                     bytenr, root_objectid,
1675                                     ref_generation,
1676                                     owner_objectid, owner_offset, 1);
1677         if (ret == 0) {
1678                 struct btrfs_key found_key;
1679                 extent_slot = path->slots[0];
1680                 while(extent_slot > 0) {
1681                         extent_slot--;
1682                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1683                                               extent_slot);
1684                         if (found_key.objectid != bytenr)
1685                                 break;
1686                         if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
1687                             found_key.offset == num_bytes) {
1688                                 found_extent = 1;
1689                                 break;
1690                         }
1691                         if (path->slots[0] - extent_slot > 5)
1692                                 break;
1693                 }
1694                 if (!found_extent)
1695                         ret = btrfs_del_item(trans, extent_root, path);
1696         } else {
1697                 btrfs_print_leaf(extent_root, path->nodes[0]);
1698                 WARN_ON(1);
1699                 printk("Unable to find ref byte nr %Lu root %Lu "
1700                        " gen %Lu owner %Lu offset %Lu\n", bytenr,
1701                        root_objectid, ref_generation, owner_objectid,
1702                        owner_offset);
1703         }
1704         if (!found_extent) {
1705                 btrfs_release_path(extent_root, path);
1706                 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1707                 if (ret < 0)
1708                         return ret;
1709                 BUG_ON(ret);
1710                 extent_slot = path->slots[0];
1711         }
1712
1713         leaf = path->nodes[0];
1714         ei = btrfs_item_ptr(leaf, extent_slot,
1715                             struct btrfs_extent_item);
1716         refs = btrfs_extent_refs(leaf, ei);
1717         BUG_ON(refs == 0);
1718         refs -= 1;
1719         btrfs_set_extent_refs(leaf, ei, refs);
1720
1721         btrfs_mark_buffer_dirty(leaf);
1722
1723         if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
1724                 /* if the back ref and the extent are next to each other
1725                  * they get deleted below in one shot
1726                  */
1727                 path->slots[0] = extent_slot;
1728                 num_to_del = 2;
1729         } else if (found_extent) {
1730                 /* otherwise delete the extent back ref */
1731                 ret = btrfs_del_item(trans, extent_root, path);
1732                 BUG_ON(ret);
1733                 /* if refs are 0, we need to setup the path for deletion */
1734                 if (refs == 0) {
1735                         btrfs_release_path(extent_root, path);
1736                         ret = btrfs_search_slot(trans, extent_root, &key, path,
1737                                                 -1, 1);
1738                         if (ret < 0)
1739                                 return ret;
1740                         BUG_ON(ret);
1741                 }
1742         }
1743
1744         if (refs == 0) {
1745                 u64 super_used;
1746                 u64 root_used;
1747 #ifdef BIO_RW_DISCARD
1748                 u64 map_length = num_bytes;
1749                 struct btrfs_multi_bio *multi = NULL;
1750 #endif
1751
1752                 if (pin) {
1753                         ret = pin_down_bytes(root, bytenr, num_bytes,
1754                              owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 0);
1755                         if (ret > 0)
1756                                 mark_free = 1;
1757                         BUG_ON(ret < 0);
1758                 }
1759
1760                 /* block accounting for super block */
1761                 spin_lock_irq(&info->delalloc_lock);
1762                 super_used = btrfs_super_bytes_used(&info->super_copy);
1763                 btrfs_set_super_bytes_used(&info->super_copy,
1764                                            super_used - num_bytes);
1765                 spin_unlock_irq(&info->delalloc_lock);
1766
1767                 /* block accounting for root item */
1768                 root_used = btrfs_root_used(&root->root_item);
1769                 btrfs_set_root_used(&root->root_item,
1770                                            root_used - num_bytes);
1771                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
1772                                       num_to_del);
1773                 if (ret) {
1774                         return ret;
1775                 }
1776                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
1777                                          mark_free);
1778                 BUG_ON(ret);
1779
1780 #ifdef BIO_RW_DISCARD
1781                 /* Tell the block device(s) that the sectors can be discarded */
1782                 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1783                                       bytenr, &map_length, &multi, 0);
1784                 if (!ret) {
1785                         struct btrfs_bio_stripe *stripe = multi->stripes;
1786                         int i;
1787
1788                         if (map_length > num_bytes)
1789                                 map_length = num_bytes;
1790
1791                         for (i = 0; i < multi->num_stripes; i++, stripe++) {
1792                                 blkdev_issue_discard(stripe->dev->bdev,
1793                                                      stripe->physical >> 9,
1794                                                      map_length >> 9);
1795                         }
1796                         kfree(multi);
1797                 }
1798 #endif
1799         }
1800         btrfs_free_path(path);
1801         finish_current_insert(trans, extent_root);
1802         return ret;
1803 }
1804
1805 /*
1806  * find all the blocks marked as pending in the radix tree and remove
1807  * them from the extent map
1808  */
1809 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
1810                                btrfs_root *extent_root)
1811 {
1812         int ret;
1813         int err = 0;
1814         u64 start;
1815         u64 end;
1816         struct extent_io_tree *pending_del;
1817         struct extent_io_tree *pinned_extents;
1818
1819         WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
1820         pending_del = &extent_root->fs_info->pending_del;
1821         pinned_extents = &extent_root->fs_info->pinned_extents;
1822
1823         while(1) {
1824                 ret = find_first_extent_bit(pending_del, 0, &start, &end,
1825                                             EXTENT_LOCKED);
1826                 if (ret)
1827                         break;
1828                 clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
1829                                   GFP_NOFS);
1830                 if (!test_range_bit(&extent_root->fs_info->extent_ins,
1831                                     start, end, EXTENT_LOCKED, 0)) {
1832                         btrfs_update_pinned_extents(extent_root, start,
1833                                               end + 1 - start, 1);
1834                         ret = __free_extent(trans, extent_root,
1835                                              start, end + 1 - start,
1836                                              extent_root->root_key.objectid,
1837                                              0, 0, 0, 0, 0);
1838                 } else {
1839                         clear_extent_bits(&extent_root->fs_info->extent_ins,
1840                                           start, end, EXTENT_LOCKED, GFP_NOFS);
1841                 }
1842                 if (ret)
1843                         err = ret;
1844
1845                 if (need_resched()) {
1846                         mutex_unlock(&extent_root->fs_info->alloc_mutex);
1847                         cond_resched();
1848                         mutex_lock(&extent_root->fs_info->alloc_mutex);
1849                 }
1850         }
1851         return err;
1852 }
1853
1854 /*
1855  * remove an extent from the root, returns 0 on success
1856  */
1857 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
1858                                struct btrfs_root *root, u64 bytenr,
1859                                u64 num_bytes, u64 root_objectid,
1860                                u64 ref_generation, u64 owner_objectid,
1861                                u64 owner_offset, int pin)
1862 {
1863         struct btrfs_root *extent_root = root->fs_info->extent_root;
1864         int pending_ret;
1865         int ret;
1866
1867         WARN_ON(num_bytes < root->sectorsize);
1868         if (!root->ref_cows)
1869                 ref_generation = 0;
1870
1871         if (root == extent_root) {
1872                 pin_down_bytes(root, bytenr, num_bytes, 0, 1);
1873                 return 0;
1874         }
1875         /* if metadata always pin */
1876         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
1877                 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
1878                         /* btrfs_free_reserved_extent */
1879                         set_extent_dirty(&root->fs_info->free_space_cache,
1880                                  bytenr, bytenr + num_bytes - 1, GFP_NOFS);
1881                         return 0;
1882                 }
1883                 pin = 1;
1884         }
1885
1886         /* if data pin when any transaction has committed this */
1887         if (ref_generation != trans->transid)
1888                 pin = 1;
1889
1890         ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
1891                             ref_generation, owner_objectid, owner_offset,
1892                             pin, pin == 0);
1893
1894         finish_current_insert(trans, root->fs_info->extent_root);
1895         pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
1896         return ret ? ret : pending_ret;
1897 }
1898
1899 int btrfs_free_extent(struct btrfs_trans_handle *trans,
1900                       struct btrfs_root *root, u64 bytenr,
1901                       u64 num_bytes, u64 root_objectid,
1902                       u64 ref_generation, u64 owner_objectid,
1903                       u64 owner_offset, int pin)
1904 {
1905         int ret;
1906
1907         maybe_lock_mutex(root);
1908         ret = __btrfs_free_extent(trans, root, bytenr, num_bytes,
1909                                   root_objectid, ref_generation,
1910                                   owner_objectid, owner_offset, pin);
1911         maybe_unlock_mutex(root);
1912         return ret;
1913 }
1914
1915 static u64 stripe_align(struct btrfs_root *root, u64 val)
1916 {
1917         u64 mask = ((u64)root->stripesize - 1);
1918         u64 ret = (val + mask) & ~mask;
1919         return ret;
1920 }
1921
1922 /*
1923  * walks the btree of allocated extents and find a hole of a given size.
1924  * The key ins is changed to record the hole:
1925  * ins->objectid == block start
1926  * ins->flags = BTRFS_EXTENT_ITEM_KEY
1927  * ins->offset == number of blocks
1928  * Any available blocks before search_start are skipped.
1929  */
1930 static int noinline find_free_extent(struct btrfs_trans_handle *trans,
1931                                      struct btrfs_root *orig_root,
1932                                      u64 num_bytes, u64 empty_size,
1933                                      u64 search_start, u64 search_end,
1934                                      u64 hint_byte, struct btrfs_key *ins,
1935                                      u64 exclude_start, u64 exclude_nr,
1936                                      int data)
1937 {
1938         int ret;
1939         u64 orig_search_start;
1940         struct btrfs_root * root = orig_root->fs_info->extent_root;
1941         struct btrfs_fs_info *info = root->fs_info;
1942         u64 total_needed = num_bytes;
1943         u64 *last_ptr = NULL;
1944         struct btrfs_block_group_cache *block_group;
1945         int full_scan = 0;
1946         int wrapped = 0;
1947         int chunk_alloc_done = 0;
1948         int empty_cluster = 2 * 1024 * 1024;
1949         int allowed_chunk_alloc = 0;
1950
1951         WARN_ON(num_bytes < root->sectorsize);
1952         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
1953
1954         if (orig_root->ref_cows || empty_size)
1955                 allowed_chunk_alloc = 1;
1956
1957         if (data & BTRFS_BLOCK_GROUP_METADATA) {
1958                 last_ptr = &root->fs_info->last_alloc;
1959                 empty_cluster = 256 * 1024;
1960         }
1961
1962         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
1963                 last_ptr = &root->fs_info->last_data_alloc;
1964         }
1965         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
1966                 last_ptr = &root->fs_info->last_log_alloc;
1967                 if (!last_ptr == 0 && root->fs_info->last_alloc) {
1968                         *last_ptr = root->fs_info->last_alloc + empty_cluster;
1969                 }
1970         }
1971
1972         if (last_ptr) {
1973                 if (*last_ptr)
1974                         hint_byte = *last_ptr;
1975                 else {
1976                         empty_size += empty_cluster;
1977                 }
1978         }
1979
1980         search_start = max(search_start, first_logical_byte(root, 0));
1981         orig_search_start = search_start;
1982
1983         if (search_end == (u64)-1)
1984                 search_end = btrfs_super_total_bytes(&info->super_copy);
1985
1986         if (hint_byte) {
1987                 block_group = btrfs_lookup_first_block_group(info, hint_byte);
1988                 if (!block_group)
1989                         hint_byte = search_start;
1990                 block_group = btrfs_find_block_group(root, block_group,
1991                                                      hint_byte, data, 1);
1992                 if (last_ptr && *last_ptr == 0 && block_group)
1993                         hint_byte = block_group->key.objectid;
1994         } else {
1995                 block_group = btrfs_find_block_group(root,
1996                                                      trans->block_group,
1997                                                      search_start, data, 1);
1998         }
1999         search_start = max(search_start, hint_byte);
2000
2001         total_needed += empty_size;
2002
2003 check_failed:
2004         if (!block_group) {
2005                 block_group = btrfs_lookup_first_block_group(info,
2006                                                              search_start);
2007                 if (!block_group)
2008                         block_group = btrfs_lookup_first_block_group(info,
2009                                                        orig_search_start);
2010         }
2011         if (full_scan && !chunk_alloc_done) {
2012                 if (allowed_chunk_alloc) {
2013                         do_chunk_alloc(trans, root,
2014                                      num_bytes + 2 * 1024 * 1024, data, 1);
2015                         allowed_chunk_alloc = 0;
2016                 } else if (block_group && block_group_bits(block_group, data)) {
2017                         block_group->space_info->force_alloc = 1;
2018                 }
2019                 chunk_alloc_done = 1;
2020         }
2021         ret = find_search_start(root, &block_group, &search_start,
2022                                 total_needed, data);
2023         if (ret == -ENOSPC && last_ptr && *last_ptr) {
2024                 *last_ptr = 0;
2025                 block_group = btrfs_lookup_first_block_group(info,
2026                                                              orig_search_start);
2027                 search_start = orig_search_start;
2028                 ret = find_search_start(root, &block_group, &search_start,
2029                                         total_needed, data);
2030         }
2031         if (ret == -ENOSPC)
2032                 goto enospc;
2033         if (ret)
2034                 goto error;
2035
2036         if (last_ptr && *last_ptr && search_start != *last_ptr) {
2037                 *last_ptr = 0;
2038                 if (!empty_size) {
2039                         empty_size += empty_cluster;
2040                         total_needed += empty_size;
2041                 }
2042                 block_group = btrfs_lookup_first_block_group(info,
2043                                                        orig_search_start);
2044                 search_start = orig_search_start;
2045                 ret = find_search_start(root, &block_group,
2046                                         &search_start, total_needed, data);
2047                 if (ret == -ENOSPC)
2048                         goto enospc;
2049                 if (ret)
2050                         goto error;
2051         }
2052
2053         search_start = stripe_align(root, search_start);
2054         ins->objectid = search_start;
2055         ins->offset = num_bytes;
2056
2057         if (ins->objectid + num_bytes >= search_end)
2058                 goto enospc;
2059
2060         if (ins->objectid + num_bytes >
2061             block_group->key.objectid + block_group->key.offset) {
2062                 search_start = block_group->key.objectid +
2063                         block_group->key.offset;
2064                 goto new_group;
2065         }
2066
2067         if (test_range_bit(&info->extent_ins, ins->objectid,
2068                            ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
2069                 search_start = ins->objectid + num_bytes;
2070                 goto new_group;
2071         }
2072
2073         if (test_range_bit(&info->pinned_extents, ins->objectid,
2074                            ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
2075                 search_start = ins->objectid + num_bytes;
2076                 goto new_group;
2077         }
2078
2079         if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
2080             ins->objectid < exclude_start + exclude_nr)) {
2081                 search_start = exclude_start + exclude_nr;
2082                 goto new_group;
2083         }
2084
2085         if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
2086                 block_group = btrfs_lookup_block_group(info, ins->objectid);
2087                 if (block_group)
2088                         trans->block_group = block_group;
2089         }
2090         ins->offset = num_bytes;
2091         if (last_ptr) {
2092                 *last_ptr = ins->objectid + ins->offset;
2093                 if (*last_ptr ==
2094                     btrfs_super_total_bytes(&root->fs_info->super_copy)) {
2095                         *last_ptr = 0;
2096                 }
2097         }
2098         return 0;
2099
2100 new_group:
2101         if (search_start + num_bytes >= search_end) {
2102 enospc:
2103                 search_start = orig_search_start;
2104                 if (full_scan) {
2105                         ret = -ENOSPC;
2106                         goto error;
2107                 }
2108                 if (wrapped) {
2109                         if (!full_scan)
2110                                 total_needed -= empty_size;
2111                         full_scan = 1;
2112                 } else
2113                         wrapped = 1;
2114         }
2115         block_group = btrfs_lookup_first_block_group(info, search_start);
2116         cond_resched();
2117         block_group = btrfs_find_block_group(root, block_group,
2118                                              search_start, data, 0);
2119         goto check_failed;
2120
2121 error:
2122         return ret;
2123 }
2124
2125 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2126                                   struct btrfs_root *root,
2127                                   u64 num_bytes, u64 min_alloc_size,
2128                                   u64 empty_size, u64 hint_byte,
2129                                   u64 search_end, struct btrfs_key *ins,
2130                                   u64 data)
2131 {
2132         int ret;
2133         u64 search_start = 0;
2134         u64 alloc_profile;
2135         struct btrfs_fs_info *info = root->fs_info;
2136
2137         if (data) {
2138                 alloc_profile = info->avail_data_alloc_bits &
2139                                 info->data_alloc_profile;
2140                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2141         } else if (root == root->fs_info->chunk_root) {
2142                 alloc_profile = info->avail_system_alloc_bits &
2143                                 info->system_alloc_profile;
2144                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2145         } else {
2146                 alloc_profile = info->avail_metadata_alloc_bits &
2147                                 info->metadata_alloc_profile;
2148                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2149         }
2150 again:
2151         data = reduce_alloc_profile(root, data);
2152         /*
2153          * the only place that sets empty_size is btrfs_realloc_node, which
2154          * is not called recursively on allocations
2155          */
2156         if (empty_size || root->ref_cows) {
2157                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
2158                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2159                                      2 * 1024 * 1024,
2160                                      BTRFS_BLOCK_GROUP_METADATA |
2161                                      (info->metadata_alloc_profile &
2162                                       info->avail_metadata_alloc_bits), 0);
2163                         BUG_ON(ret);
2164                 }
2165                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2166                                      num_bytes + 2 * 1024 * 1024, data, 0);
2167                 BUG_ON(ret);
2168         }
2169
2170         WARN_ON(num_bytes < root->sectorsize);
2171         ret = find_free_extent(trans, root, num_bytes, empty_size,
2172                                search_start, search_end, hint_byte, ins,
2173                                trans->alloc_exclude_start,
2174                                trans->alloc_exclude_nr, data);
2175
2176         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
2177                 num_bytes = num_bytes >> 1;
2178                 num_bytes = max(num_bytes, min_alloc_size);
2179                 do_chunk_alloc(trans, root->fs_info->extent_root,
2180                                num_bytes, data, 1);
2181                 goto again;
2182         }
2183         if (ret) {
2184                 printk("allocation failed flags %Lu\n", data);
2185                 BUG();
2186         }
2187         clear_extent_dirty(&root->fs_info->free_space_cache,
2188                            ins->objectid, ins->objectid + ins->offset - 1,
2189                            GFP_NOFS);
2190         return 0;
2191 }
2192
2193 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2194 {
2195         maybe_lock_mutex(root);
2196         set_extent_dirty(&root->fs_info->free_space_cache,
2197                          start, start + len - 1, GFP_NOFS);
2198         maybe_unlock_mutex(root);
2199         return 0;
2200 }
2201
2202 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2203                                   struct btrfs_root *root,
2204                                   u64 num_bytes, u64 min_alloc_size,
2205                                   u64 empty_size, u64 hint_byte,
2206                                   u64 search_end, struct btrfs_key *ins,
2207                                   u64 data)
2208 {
2209         int ret;
2210         maybe_lock_mutex(root);
2211         ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2212                                      empty_size, hint_byte, search_end, ins,
2213                                      data);
2214         maybe_unlock_mutex(root);
2215         return ret;
2216 }
2217
2218 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2219                                          struct btrfs_root *root,
2220                                          u64 root_objectid, u64 ref_generation,
2221                                          u64 owner, u64 owner_offset,
2222                                          struct btrfs_key *ins)
2223 {
2224         int ret;
2225         int pending_ret;
2226         u64 super_used;
2227         u64 root_used;
2228         u64 num_bytes = ins->offset;
2229         u32 sizes[2];
2230         struct btrfs_fs_info *info = root->fs_info;
2231         struct btrfs_root *extent_root = info->extent_root;
2232         struct btrfs_extent_item *extent_item;
2233         struct btrfs_extent_ref *ref;
2234         struct btrfs_path *path;
2235         struct btrfs_key keys[2];
2236
2237         /* block accounting for super block */
2238         spin_lock_irq(&info->delalloc_lock);
2239         super_used = btrfs_super_bytes_used(&info->super_copy);
2240         btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2241         spin_unlock_irq(&info->delalloc_lock);
2242
2243         /* block accounting for root item */
2244         root_used = btrfs_root_used(&root->root_item);
2245         btrfs_set_root_used(&root->root_item, root_used + num_bytes);
2246
2247         if (root == extent_root) {
2248                 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
2249                                 ins->objectid + ins->offset - 1,
2250                                 EXTENT_LOCKED, GFP_NOFS);
2251                 goto update_block;
2252         }
2253
2254         memcpy(&keys[0], ins, sizeof(*ins));
2255         keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
2256                                          owner, owner_offset);
2257         keys[1].objectid = ins->objectid;
2258         keys[1].type = BTRFS_EXTENT_REF_KEY;
2259         sizes[0] = sizeof(*extent_item);
2260         sizes[1] = sizeof(*ref);
2261
2262         path = btrfs_alloc_path();
2263         BUG_ON(!path);
2264
2265         ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
2266                                        sizes, 2);
2267
2268         BUG_ON(ret);
2269         extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2270                                      struct btrfs_extent_item);
2271         btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
2272         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
2273                              struct btrfs_extent_ref);
2274
2275         btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
2276         btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
2277         btrfs_set_ref_objectid(path->nodes[0], ref, owner);
2278         btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
2279
2280         btrfs_mark_buffer_dirty(path->nodes[0]);
2281
2282         trans->alloc_exclude_start = 0;
2283         trans->alloc_exclude_nr = 0;
2284         btrfs_free_path(path);
2285         finish_current_insert(trans, extent_root);
2286         pending_ret = del_pending_extents(trans, extent_root);
2287
2288         if (ret)
2289                 goto out;
2290         if (pending_ret) {
2291                 ret = pending_ret;
2292                 goto out;
2293         }
2294
2295 update_block:
2296         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
2297         if (ret) {
2298                 printk("update block group failed for %Lu %Lu\n",
2299                        ins->objectid, ins->offset);
2300                 BUG();
2301         }
2302 out:
2303         return ret;
2304 }
2305
2306 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2307                                 struct btrfs_root *root,
2308                                 u64 root_objectid, u64 ref_generation,
2309                                 u64 owner, u64 owner_offset,
2310                                 struct btrfs_key *ins)
2311 {
2312         int ret;
2313         maybe_lock_mutex(root);
2314         ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
2315                                             ref_generation, owner,
2316                                             owner_offset, ins);
2317         maybe_unlock_mutex(root);
2318         return ret;
2319 }
2320
2321 /*
2322  * this is used by the tree logging recovery code.  It records that
2323  * an extent has been allocated and makes sure to clear the free
2324  * space cache bits as well
2325  */
2326 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
2327                                 struct btrfs_root *root,
2328                                 u64 root_objectid, u64 ref_generation,
2329                                 u64 owner, u64 owner_offset,
2330                                 struct btrfs_key *ins)
2331 {
2332         int ret;
2333         struct btrfs_block_group_cache *block_group;
2334
2335         maybe_lock_mutex(root);
2336         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
2337         cache_block_group(root, block_group);
2338
2339         clear_extent_dirty(&root->fs_info->free_space_cache,
2340                            ins->objectid, ins->objectid + ins->offset - 1,
2341                            GFP_NOFS);
2342         ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
2343                                             ref_generation, owner,
2344                                             owner_offset, ins);
2345         maybe_unlock_mutex(root);
2346         return ret;
2347 }
2348
2349 /*
2350  * finds a free extent and does all the dirty work required for allocation
2351  * returns the key for the extent through ins, and a tree buffer for
2352  * the first block of the extent through buf.
2353  *
2354  * returns 0 if everything worked, non-zero otherwise.
2355  */
2356 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
2357                        struct btrfs_root *root,
2358                        u64 num_bytes, u64 min_alloc_size,
2359                        u64 root_objectid, u64 ref_generation,
2360                        u64 owner, u64 owner_offset,
2361                        u64 empty_size, u64 hint_byte,
2362                        u64 search_end, struct btrfs_key *ins, u64 data)
2363 {
2364         int ret;
2365
2366         maybe_lock_mutex(root);
2367
2368         ret = __btrfs_reserve_extent(trans, root, num_bytes,
2369                                      min_alloc_size, empty_size, hint_byte,
2370                                      search_end, ins, data);
2371         BUG_ON(ret);
2372         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
2373                 ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
2374                                                     ref_generation, owner,
2375                                                     owner_offset, ins);
2376                 BUG_ON(ret);
2377
2378         }
2379         maybe_unlock_mutex(root);
2380         return ret;
2381 }
2382
2383 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2384                                             struct btrfs_root *root,
2385                                             u64 bytenr, u32 blocksize)
2386 {
2387         struct extent_buffer *buf;
2388
2389         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
2390         if (!buf)
2391                 return ERR_PTR(-ENOMEM);
2392         btrfs_set_header_generation(buf, trans->transid);
2393         btrfs_tree_lock(buf);
2394         clean_tree_block(trans, root, buf);
2395         btrfs_set_buffer_uptodate(buf);
2396         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2397                 set_extent_dirty(&root->dirty_log_pages, buf->start,
2398                          buf->start + buf->len - 1, GFP_NOFS);
2399         } else {
2400                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
2401                          buf->start + buf->len - 1, GFP_NOFS);
2402         }
2403         trans->blocks_used++;
2404         return buf;
2405 }
2406
2407 /*
2408  * helper function to allocate a block for a given tree
2409  * returns the tree buffer or NULL.
2410  */
2411 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2412                                              struct btrfs_root *root,
2413                                              u32 blocksize,
2414                                              u64 root_objectid,
2415                                              u64 ref_generation,
2416                                              u64 first_objectid,
2417                                              int level,
2418                                              u64 hint,
2419                                              u64 empty_size)
2420 {
2421         struct btrfs_key ins;
2422         int ret;
2423         struct extent_buffer *buf;
2424
2425         ret = btrfs_alloc_extent(trans, root, blocksize, blocksize,
2426                                  root_objectid, ref_generation,
2427                                  level, first_objectid, empty_size, hint,
2428                                  (u64)-1, &ins, 0);
2429         if (ret) {
2430                 BUG_ON(ret > 0);
2431                 return ERR_PTR(ret);
2432         }
2433
2434         buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
2435         return buf;
2436 }
2437
2438 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
2439                         struct btrfs_root *root, struct extent_buffer *leaf)
2440 {
2441         u64 leaf_owner;
2442         u64 leaf_generation;
2443         struct btrfs_key key;
2444         struct btrfs_file_extent_item *fi;
2445         int i;
2446         int nritems;
2447         int ret;
2448
2449         BUG_ON(!btrfs_is_leaf(leaf));
2450         nritems = btrfs_header_nritems(leaf);
2451         leaf_owner = btrfs_header_owner(leaf);
2452         leaf_generation = btrfs_header_generation(leaf);
2453
2454         for (i = 0; i < nritems; i++) {
2455                 u64 disk_bytenr;
2456                 cond_resched();
2457
2458                 btrfs_item_key_to_cpu(leaf, &key, i);
2459                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2460                         continue;
2461                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2462                 if (btrfs_file_extent_type(leaf, fi) ==
2463                     BTRFS_FILE_EXTENT_INLINE)
2464                         continue;
2465                 /*
2466                  * FIXME make sure to insert a trans record that
2467                  * repeats the snapshot del on crash
2468                  */
2469                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2470                 if (disk_bytenr == 0)
2471                         continue;
2472
2473                 mutex_lock(&root->fs_info->alloc_mutex);
2474                 ret = __btrfs_free_extent(trans, root, disk_bytenr,
2475                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
2476                                 leaf_owner, leaf_generation,
2477                                 key.objectid, key.offset, 0);
2478                 mutex_unlock(&root->fs_info->alloc_mutex);
2479
2480                 atomic_inc(&root->fs_info->throttle_gen);
2481                 wake_up(&root->fs_info->transaction_throttle);
2482                 cond_resched();
2483
2484                 BUG_ON(ret);
2485         }
2486         return 0;
2487 }
2488
2489 static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
2490                                         struct btrfs_root *root,
2491                                         struct btrfs_leaf_ref *ref)
2492 {
2493         int i;
2494         int ret;
2495         struct btrfs_extent_info *info = ref->extents;
2496
2497         for (i = 0; i < ref->nritems; i++) {
2498                 mutex_lock(&root->fs_info->alloc_mutex);
2499                 ret = __btrfs_free_extent(trans, root,
2500                                         info->bytenr, info->num_bytes,
2501                                         ref->owner, ref->generation,
2502                                         info->objectid, info->offset, 0);
2503                 mutex_unlock(&root->fs_info->alloc_mutex);
2504
2505                 atomic_inc(&root->fs_info->throttle_gen);
2506                 wake_up(&root->fs_info->transaction_throttle);
2507                 cond_resched();
2508
2509                 BUG_ON(ret);
2510                 info++;
2511         }
2512
2513         return 0;
2514 }
2515
2516 int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
2517                               u32 *refs)
2518 {
2519         int ret;
2520
2521         ret = lookup_extent_ref(NULL, root, start, len, refs);
2522         BUG_ON(ret);
2523
2524 #if 0 // some debugging code in case we see problems here
2525         /* if the refs count is one, it won't get increased again.  But
2526          * if the ref count is > 1, someone may be decreasing it at
2527          * the same time we are.
2528          */
2529         if (*refs != 1) {
2530                 struct extent_buffer *eb = NULL;
2531                 eb = btrfs_find_create_tree_block(root, start, len);
2532                 if (eb)
2533                         btrfs_tree_lock(eb);
2534
2535                 mutex_lock(&root->fs_info->alloc_mutex);
2536                 ret = lookup_extent_ref(NULL, root, start, len, refs);
2537                 BUG_ON(ret);
2538                 mutex_unlock(&root->fs_info->alloc_mutex);
2539
2540                 if (eb) {
2541                         btrfs_tree_unlock(eb);
2542                         free_extent_buffer(eb);
2543                 }
2544                 if (*refs == 1) {
2545                         printk("block %llu went down to one during drop_snap\n",
2546                                (unsigned long long)start);
2547                 }
2548
2549         }
2550 #endif
2551
2552         cond_resched();
2553         return ret;
2554 }
2555
2556 /*
2557  * helper function for drop_snapshot, this walks down the tree dropping ref
2558  * counts as it goes.
2559  */
2560 static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
2561                                    struct btrfs_root *root,
2562                                    struct btrfs_path *path, int *level)
2563 {
2564         u64 root_owner;
2565         u64 root_gen;
2566         u64 bytenr;
2567         u64 ptr_gen;
2568         struct extent_buffer *next;
2569         struct extent_buffer *cur;
2570         struct extent_buffer *parent;
2571         struct btrfs_leaf_ref *ref;
2572         u32 blocksize;
2573         int ret;
2574         u32 refs;
2575
2576         WARN_ON(*level < 0);
2577         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2578         ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
2579                                 path->nodes[*level]->len, &refs);
2580         BUG_ON(ret);
2581         if (refs > 1)
2582                 goto out;
2583
2584         /*
2585          * walk down to the last node level and free all the leaves
2586          */
2587         while(*level >= 0) {
2588                 WARN_ON(*level < 0);
2589                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2590                 cur = path->nodes[*level];
2591
2592                 if (btrfs_header_level(cur) != *level)
2593                         WARN_ON(1);
2594
2595                 if (path->slots[*level] >=
2596                     btrfs_header_nritems(cur))
2597                         break;
2598                 if (*level == 0) {
2599                         ret = btrfs_drop_leaf_ref(trans, root, cur);
2600                         BUG_ON(ret);
2601                         break;
2602                 }
2603                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2604                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2605                 blocksize = btrfs_level_size(root, *level - 1);
2606
2607                 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
2608                 BUG_ON(ret);
2609                 if (refs != 1) {
2610                         parent = path->nodes[*level];
2611                         root_owner = btrfs_header_owner(parent);
2612                         root_gen = btrfs_header_generation(parent);
2613                         path->slots[*level]++;
2614
2615                         mutex_lock(&root->fs_info->alloc_mutex);
2616                         ret = __btrfs_free_extent(trans, root, bytenr,
2617                                                 blocksize, root_owner,
2618                                                 root_gen, 0, 0, 1);
2619                         BUG_ON(ret);
2620                         mutex_unlock(&root->fs_info->alloc_mutex);
2621
2622                         atomic_inc(&root->fs_info->throttle_gen);
2623                         wake_up(&root->fs_info->transaction_throttle);
2624                         cond_resched();
2625
2626                         continue;
2627                 }
2628                 /*
2629                  * at this point, we have a single ref, and since the
2630                  * only place referencing this extent is a dead root
2631                  * the reference count should never go higher.
2632                  * So, we don't need to check it again
2633                  */
2634                 if (*level == 1) {
2635                         struct btrfs_key key;
2636                         btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
2637                         ref = btrfs_lookup_leaf_ref(root, bytenr);
2638                         if (ref) {
2639                                 ret = cache_drop_leaf_ref(trans, root, ref);
2640                                 BUG_ON(ret);
2641                                 btrfs_remove_leaf_ref(root, ref);
2642                                 btrfs_free_leaf_ref(root, ref);
2643                                 *level = 0;
2644                                 break;
2645                         }
2646                         if (printk_ratelimit())
2647                                 printk("leaf ref miss for bytenr %llu\n",
2648                                        (unsigned long long)bytenr);
2649                 }
2650                 next = btrfs_find_tree_block(root, bytenr, blocksize);
2651                 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
2652                         free_extent_buffer(next);
2653
2654                         next = read_tree_block(root, bytenr, blocksize,
2655                                                ptr_gen);
2656                         cond_resched();
2657 #if 0
2658                         /*
2659                          * this is a debugging check and can go away
2660                          * the ref should never go all the way down to 1
2661                          * at this point
2662                          */
2663                         ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
2664                                                 &refs);
2665                         BUG_ON(ret);
2666                         WARN_ON(refs != 1);
2667 #endif
2668                 }
2669                 WARN_ON(*level <= 0);
2670                 if (path->nodes[*level-1])
2671                         free_extent_buffer(path->nodes[*level-1]);
2672                 path->nodes[*level-1] = next;
2673                 *level = btrfs_header_level(next);
2674                 path->slots[*level] = 0;
2675                 cond_resched();
2676         }
2677 out:
2678         WARN_ON(*level < 0);
2679         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2680
2681         if (path->nodes[*level] == root->node) {
2682                 parent = path->nodes[*level];
2683                 bytenr = path->nodes[*level]->start;
2684         } else {
2685                 parent = path->nodes[*level + 1];
2686                 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
2687         }
2688
2689         blocksize = btrfs_level_size(root, *level);
2690         root_owner = btrfs_header_owner(parent);
2691         root_gen = btrfs_header_generation(parent);
2692
2693         mutex_lock(&root->fs_info->alloc_mutex);
2694         ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
2695                                   root_owner, root_gen, 0, 0, 1);
2696         free_extent_buffer(path->nodes[*level]);
2697         path->nodes[*level] = NULL;
2698         *level += 1;
2699         BUG_ON(ret);
2700         mutex_unlock(&root->fs_info->alloc_mutex);
2701
2702         cond_resched();
2703         return 0;
2704 }
2705
2706 /*
2707  * helper for dropping snapshots.  This walks back up the tree in the path
2708  * to find the first node higher up where we haven't yet gone through
2709  * all the slots
2710  */
2711 static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
2712                                  struct btrfs_root *root,
2713                                  struct btrfs_path *path, int *level)
2714 {
2715         u64 root_owner;
2716         u64 root_gen;
2717         struct btrfs_root_item *root_item = &root->root_item;
2718         int i;
2719         int slot;
2720         int ret;
2721
2722         for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2723                 slot = path->slots[i];
2724                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
2725                         struct extent_buffer *node;
2726                         struct btrfs_disk_key disk_key;
2727                         node = path->nodes[i];
2728                         path->slots[i]++;
2729                         *level = i;
2730                         WARN_ON(*level == 0);
2731                         btrfs_node_key(node, &disk_key, path->slots[i]);
2732                         memcpy(&root_item->drop_progress,
2733                                &disk_key, sizeof(disk_key));
2734                         root_item->drop_level = i;
2735                         return 0;
2736                 } else {
2737                         if (path->nodes[*level] == root->node) {
2738                                 root_owner = root->root_key.objectid;
2739                                 root_gen =
2740                                    btrfs_header_generation(path->nodes[*level]);
2741                         } else {
2742                                 struct extent_buffer *node;
2743                                 node = path->nodes[*level + 1];
2744                                 root_owner = btrfs_header_owner(node);
2745                                 root_gen = btrfs_header_generation(node);
2746                         }
2747                         ret = btrfs_free_extent(trans, root,
2748                                                 path->nodes[*level]->start,
2749                                                 path->nodes[*level]->len,
2750                                                 root_owner, root_gen, 0, 0, 1);
2751                         BUG_ON(ret);
2752                         free_extent_buffer(path->nodes[*level]);
2753                         path->nodes[*level] = NULL;
2754                         *level = i + 1;
2755                 }
2756         }
2757         return 1;
2758 }
2759
2760 /*
2761  * drop the reference count on the tree rooted at 'snap'.  This traverses
2762  * the tree freeing any blocks that have a ref count of zero after being
2763  * decremented.
2764  */
2765 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
2766                         *root)
2767 {
2768         int ret = 0;
2769         int wret;
2770         int level;
2771         struct btrfs_path *path;
2772         int i;
2773         int orig_level;
2774         struct btrfs_root_item *root_item = &root->root_item;
2775
2776         WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
2777         path = btrfs_alloc_path();
2778         BUG_ON(!path);
2779
2780         level = btrfs_header_level(root->node);
2781         orig_level = level;
2782         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2783                 path->nodes[level] = root->node;
2784                 extent_buffer_get(root->node);
2785                 path->slots[level] = 0;
2786         } else {
2787                 struct btrfs_key key;
2788                 struct btrfs_disk_key found_key;
2789                 struct extent_buffer *node;
2790
2791                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2792                 level = root_item->drop_level;
2793                 path->lowest_level = level;
2794                 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2795                 if (wret < 0) {
2796                         ret = wret;
2797                         goto out;
2798                 }
2799                 node = path->nodes[level];
2800                 btrfs_node_key(node, &found_key, path->slots[level]);
2801                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
2802                                sizeof(found_key)));
2803                 /*
2804                  * unlock our path, this is safe because only this
2805                  * function is allowed to delete this snapshot
2806                  */
2807                 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
2808                         if (path->nodes[i] && path->locks[i]) {
2809                                 path->locks[i] = 0;
2810                                 btrfs_tree_unlock(path->nodes[i]);
2811                         }
2812                 }
2813         }
2814         while(1) {
2815                 wret = walk_down_tree(trans, root, path, &level);
2816                 if (wret > 0)
2817                         break;
2818                 if (wret < 0)
2819                         ret = wret;
2820
2821                 wret = walk_up_tree(trans, root, path, &level);
2822                 if (wret > 0)
2823                         break;
2824                 if (wret < 0)
2825                         ret = wret;
2826                 if (trans->transaction->in_commit) {
2827                         ret = -EAGAIN;
2828                         break;
2829                 }
2830                 atomic_inc(&root->fs_info->throttle_gen);
2831                 wake_up(&root->fs_info->transaction_throttle);
2832         }
2833         for (i = 0; i <= orig_level; i++) {
2834                 if (path->nodes[i]) {
2835                         free_extent_buffer(path->nodes[i]);
2836                         path->nodes[i] = NULL;
2837                 }
2838         }
2839 out:
2840         btrfs_free_path(path);
2841         return ret;
2842 }
2843
2844 int btrfs_free_block_groups(struct btrfs_fs_info *info)
2845 {
2846         u64 start;
2847         u64 end;
2848         u64 ptr;
2849         int ret;
2850
2851         mutex_lock(&info->alloc_mutex);
2852         while(1) {
2853                 ret = find_first_extent_bit(&info->block_group_cache, 0,
2854                                             &start, &end, (unsigned int)-1);
2855                 if (ret)
2856                         break;
2857                 ret = get_state_private(&info->block_group_cache, start, &ptr);
2858                 if (!ret)
2859                         kfree((void *)(unsigned long)ptr);
2860                 clear_extent_bits(&info->block_group_cache, start,
2861                                   end, (unsigned int)-1, GFP_NOFS);
2862         }
2863         while(1) {
2864                 ret = find_first_extent_bit(&info->free_space_cache, 0,
2865                                             &start, &end, EXTENT_DIRTY);
2866                 if (ret)
2867                         break;
2868                 clear_extent_dirty(&info->free_space_cache, start,
2869                                    end, GFP_NOFS);
2870         }
2871         mutex_unlock(&info->alloc_mutex);
2872         return 0;
2873 }
2874
2875 static unsigned long calc_ra(unsigned long start, unsigned long last,
2876                              unsigned long nr)
2877 {
2878         return min(last, start + nr - 1);
2879 }
2880
2881 static int noinline relocate_inode_pages(struct inode *inode, u64 start,
2882                                          u64 len)
2883 {
2884         u64 page_start;
2885         u64 page_end;
2886         unsigned long last_index;
2887         unsigned long i;
2888         struct page *page;
2889         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2890         struct file_ra_state *ra;
2891         unsigned long total_read = 0;
2892         unsigned long ra_pages;
2893         struct btrfs_ordered_extent *ordered;
2894         struct btrfs_trans_handle *trans;
2895
2896         ra = kzalloc(sizeof(*ra), GFP_NOFS);
2897
2898         mutex_lock(&inode->i_mutex);
2899         i = start >> PAGE_CACHE_SHIFT;
2900         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2901
2902         ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
2903
2904         file_ra_state_init(ra, inode->i_mapping);
2905
2906         for (; i <= last_index; i++) {
2907                 if (total_read % ra_pages == 0) {
2908                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2909                                        calc_ra(i, last_index, ra_pages));
2910                 }
2911                 total_read++;
2912 again:
2913                 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
2914                         goto truncate_racing;
2915                 page = grab_cache_page(inode->i_mapping, i);
2916                 if (!page) {
2917                         goto out_unlock;
2918                 }
2919                 if (!PageUptodate(page)) {
2920                         btrfs_readpage(NULL, page);
2921                         lock_page(page);
2922                         if (!PageUptodate(page)) {
2923                                 unlock_page(page);
2924                                 page_cache_release(page);
2925                                 goto out_unlock;
2926                         }
2927                 }
2928                 wait_on_page_writeback(page);
2929
2930                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2931                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2932                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2933
2934                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2935                 if (ordered) {
2936                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2937                         unlock_page(page);
2938                         page_cache_release(page);
2939                         btrfs_start_ordered_extent(inode, ordered, 1);
2940                         btrfs_put_ordered_extent(ordered);
2941                         goto again;
2942                 }
2943                 set_page_extent_mapped(page);
2944
2945                 /*
2946                  * make sure page_mkwrite is called for this page if userland
2947                  * wants to change it from mmap
2948                  */
2949                 clear_page_dirty_for_io(page);
2950
2951                 btrfs_set_extent_delalloc(inode, page_start, page_end);
2952                 set_page_dirty(page);
2953
2954                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2955                 unlock_page(page);
2956                 page_cache_release(page);
2957         }
2958
2959 out_unlock:
2960         /* we have to start the IO in order to get the ordered extents
2961          * instantiated.  This allows the relocation to code to wait
2962          * for all the ordered extents to hit the disk.
2963          *
2964          * Otherwise, it would constantly loop over the same extents
2965          * because the old ones don't get deleted  until the IO is
2966          * started
2967          */
2968         btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
2969                                WB_SYNC_NONE);
2970         kfree(ra);
2971         trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
2972         if (trans) {
2973                 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
2974                 mark_inode_dirty(inode);
2975         }
2976         mutex_unlock(&inode->i_mutex);
2977         return 0;
2978
2979 truncate_racing:
2980         vmtruncate(inode, inode->i_size);
2981         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2982                                            total_read);
2983         goto out_unlock;
2984 }
2985
2986 /*
2987  * The back references tell us which tree holds a ref on a block,
2988  * but it is possible for the tree root field in the reference to
2989  * reflect the original root before a snapshot was made.  In this
2990  * case we should search through all the children of a given root
2991  * to find potential holders of references on a block.
2992  *
2993  * Instead, we do something a little less fancy and just search
2994  * all the roots for a given key/block combination.
2995  */
2996 static int find_root_for_ref(struct btrfs_root *root,
2997                              struct btrfs_path *path,
2998                              struct btrfs_key *key0,
2999                              int level,
3000                              int file_key,
3001                              struct btrfs_root **found_root,
3002                              u64 bytenr)
3003 {
3004         struct btrfs_key root_location;
3005         struct btrfs_root *cur_root = *found_root;
3006         struct btrfs_file_extent_item *file_extent;
3007         u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
3008         u64 found_bytenr;
3009         int ret;
3010
3011         root_location.offset = (u64)-1;
3012         root_location.type = BTRFS_ROOT_ITEM_KEY;
3013         path->lowest_level = level;
3014         path->reada = 0;
3015         while(1) {
3016                 ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
3017                 found_bytenr = 0;
3018                 if (ret == 0 && file_key) {
3019                         struct extent_buffer *leaf = path->nodes[0];
3020                         file_extent = btrfs_item_ptr(leaf, path->slots[0],
3021                                              struct btrfs_file_extent_item);
3022                         if (btrfs_file_extent_type(leaf, file_extent) ==
3023                             BTRFS_FILE_EXTENT_REG) {
3024                                 found_bytenr =
3025                                         btrfs_file_extent_disk_bytenr(leaf,
3026                                                                file_extent);
3027                        }
3028                 } else if (!file_key) {
3029                         if (path->nodes[level])
3030                                 found_bytenr = path->nodes[level]->start;
3031                 }
3032
3033                 btrfs_release_path(cur_root, path);
3034
3035                 if (found_bytenr == bytenr) {
3036                         *found_root = cur_root;
3037                         ret = 0;
3038                         goto out;
3039                 }
3040                 ret = btrfs_search_root(root->fs_info->tree_root,
3041                                         root_search_start, &root_search_start);
3042                 if (ret)
3043                         break;
3044
3045                 root_location.objectid = root_search_start;
3046                 cur_root = btrfs_read_fs_root_no_name(root->fs_info,
3047                                                       &root_location);
3048                 if (!cur_root) {
3049                         ret = 1;
3050                         break;
3051                 }
3052         }
3053 out:
3054         path->lowest_level = 0;
3055         return ret;
3056 }
3057
3058 /*
3059  * note, this releases the path
3060  */
3061 static int noinline relocate_one_reference(struct btrfs_root *extent_root,
3062                                   struct btrfs_path *path,
3063                                   struct btrfs_key *extent_key,
3064                                   u64 *last_file_objectid,
3065                                   u64 *last_file_offset,
3066                                   u64 *last_file_root,
3067                                   u64 last_extent)
3068 {
3069         struct inode *inode;
3070         struct btrfs_root *found_root;
3071         struct btrfs_key root_location;
3072         struct btrfs_key found_key;
3073         struct btrfs_extent_ref *ref;
3074         u64 ref_root;
3075         u64 ref_gen;
3076         u64 ref_objectid;
3077         u64 ref_offset;
3078         int ret;
3079         int level;
3080
3081         WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
3082
3083         ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
3084                              struct btrfs_extent_ref);
3085         ref_root = btrfs_ref_root(path->nodes[0], ref);
3086         ref_gen = btrfs_ref_generation(path->nodes[0], ref);
3087         ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
3088         ref_offset = btrfs_ref_offset(path->nodes[0], ref);
3089         btrfs_release_path(extent_root, path);
3090
3091         root_location.objectid = ref_root;
3092         if (ref_gen == 0)
3093                 root_location.offset = 0;
3094         else
3095                 root_location.offset = (u64)-1;
3096         root_location.type = BTRFS_ROOT_ITEM_KEY;
3097
3098         found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
3099                                                 &root_location);
3100         BUG_ON(!found_root);
3101         mutex_unlock(&extent_root->fs_info->alloc_mutex);
3102
3103         if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
3104                 found_key.objectid = ref_objectid;
3105                 found_key.type = BTRFS_EXTENT_DATA_KEY;
3106                 found_key.offset = ref_offset;
3107                 level = 0;
3108
3109                 if (last_extent == extent_key->objectid &&
3110                     *last_file_objectid == ref_objectid &&
3111                     *last_file_offset == ref_offset &&
3112                     *last_file_root == ref_root)
3113                         goto out;
3114
3115                 ret = find_root_for_ref(extent_root, path, &found_key,
3116                                         level, 1, &found_root,
3117                                         extent_key->objectid);
3118
3119                 if (ret)
3120                         goto out;
3121
3122                 if (last_extent == extent_key->objectid &&
3123                     *last_file_objectid == ref_objectid &&
3124                     *last_file_offset == ref_offset &&
3125                     *last_file_root == ref_root)
3126                         goto out;
3127
3128                 inode = btrfs_iget_locked(extent_root->fs_info->sb,
3129                                           ref_objectid, found_root);
3130                 if (inode->i_state & I_NEW) {
3131                         /* the inode and parent dir are two different roots */
3132                         BTRFS_I(inode)->root = found_root;
3133                         BTRFS_I(inode)->location.objectid = ref_objectid;
3134                         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
3135                         BTRFS_I(inode)->location.offset = 0;
3136                         btrfs_read_locked_inode(inode);
3137                         unlock_new_inode(inode);
3138
3139                 }
3140                 /* this can happen if the reference is not against
3141                  * the latest version of the tree root
3142                  */
3143                 if (is_bad_inode(inode))
3144                         goto out;
3145
3146                 *last_file_objectid = inode->i_ino;
3147                 *last_file_root = found_root->root_key.objectid;
3148                 *last_file_offset = ref_offset;
3149
3150                 relocate_inode_pages(inode, ref_offset, extent_key->offset);
3151                 iput(inode);
3152         } else {
3153                 struct btrfs_trans_handle *trans;
3154                 struct extent_buffer *eb;
3155                 int needs_lock = 0;
3156
3157                 eb = read_tree_block(found_root, extent_key->objectid,
3158                                      extent_key->offset, 0);
3159                 btrfs_tree_lock(eb);
3160                 level = btrfs_header_level(eb);
3161
3162                 if (level == 0)
3163                         btrfs_item_key_to_cpu(eb, &found_key, 0);
3164                 else
3165                         btrfs_node_key_to_cpu(eb, &found_key, 0);
3166
3167                 btrfs_tree_unlock(eb);
3168                 free_extent_buffer(eb);
3169
3170                 ret = find_root_for_ref(extent_root, path, &found_key,
3171                                         level, 0, &found_root,
3172                                         extent_key->objectid);
3173
3174                 if (ret)
3175                         goto out;
3176
3177                 /*
3178                  * right here almost anything could happen to our key,
3179                  * but that's ok.  The cow below will either relocate it
3180                  * or someone else will have relocated it.  Either way,
3181                  * it is in a different spot than it was before and
3182                  * we're happy.
3183                  */
3184
3185                 trans = btrfs_start_transaction(found_root, 1);
3186
3187                 if (found_root == extent_root->fs_info->extent_root ||
3188                     found_root == extent_root->fs_info->chunk_root ||
3189                     found_root == extent_root->fs_info->dev_root) {
3190                         needs_lock = 1;
3191                         mutex_lock(&extent_root->fs_info->alloc_mutex);
3192                 }
3193
3194                 path->lowest_level = level;
3195                 path->reada = 2;
3196                 ret = btrfs_search_slot(trans, found_root, &found_key, path,
3197                                         0, 1);
3198                 path->lowest_level = 0;
3199                 btrfs_release_path(found_root, path);
3200
3201                 if (found_root == found_root->fs_info->extent_root)
3202                         btrfs_extent_post_op(trans, found_root);
3203                 if (needs_lock)
3204                         mutex_unlock(&extent_root->fs_info->alloc_mutex);
3205
3206                 btrfs_end_transaction(trans, found_root);
3207
3208         }
3209 out:
3210         mutex_lock(&extent_root->fs_info->alloc_mutex);
3211         return 0;
3212 }
3213
3214 static int noinline del_extent_zero(struct btrfs_root *extent_root,
3215                                     struct btrfs_path *path,
3216                                     struct btrfs_key *extent_key)
3217 {
3218         int ret;
3219         struct btrfs_trans_handle *trans;
3220
3221         trans = btrfs_start_transaction(extent_root, 1);
3222         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
3223         if (ret > 0) {
3224                 ret = -EIO;
3225                 goto out;
3226         }
3227         if (ret < 0)
3228                 goto out;
3229         ret = btrfs_del_item(trans, extent_root, path);
3230 out:
3231         btrfs_end_transaction(trans, extent_root);
3232         return ret;
3233 }
3234
3235 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
3236                                         struct btrfs_path *path,
3237                                         struct btrfs_key *extent_key)
3238 {
3239         struct btrfs_key key;
3240         struct btrfs_key found_key;
3241         struct extent_buffer *leaf;
3242         u64 last_file_objectid = 0;
3243         u64 last_file_root = 0;
3244         u64 last_file_offset = (u64)-1;
3245         u64 last_extent = 0;
3246         u32 nritems;
3247         u32 item_size;
3248         int ret = 0;
3249
3250         if (extent_key->objectid == 0) {
3251                 ret = del_extent_zero(extent_root, path, extent_key);
3252                 goto out;
3253         }
3254         key.objectid = extent_key->objectid;
3255         key.type = BTRFS_EXTENT_REF_KEY;
3256         key.offset = 0;
3257
3258         while(1) {
3259                 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3260
3261                 if (ret < 0)
3262                         goto out;
3263
3264                 ret = 0;
3265                 leaf = path->nodes[0];
3266                 nritems = btrfs_header_nritems(leaf);
3267                 if (path->slots[0] == nritems) {
3268                         ret = btrfs_next_leaf(extent_root, path);
3269                         if (ret > 0) {
3270                                 ret = 0;
3271                                 goto out;
3272                         }
3273                         if (ret < 0)
3274                                 goto out;
3275                         leaf = path->nodes[0];
3276                 }
3277
3278                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3279                 if (found_key.objectid != extent_key->objectid) {
3280                         break;
3281                 }
3282
3283                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
3284                         break;
3285                 }
3286
3287                 key.offset = found_key.offset + 1;
3288                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3289
3290                 ret = relocate_one_reference(extent_root, path, extent_key,
3291                                              &last_file_objectid,
3292                                              &last_file_offset,
3293                                              &last_file_root, last_extent);
3294                 if (ret)
3295                         goto out;
3296                 last_extent = extent_key->objectid;
3297         }
3298         ret = 0;
3299 out:
3300         btrfs_release_path(extent_root, path);
3301         return ret;
3302 }
3303
3304 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
3305 {
3306         u64 num_devices;
3307         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
3308                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
3309
3310         num_devices = root->fs_info->fs_devices->num_devices;
3311         if (num_devices == 1) {
3312                 stripped |= BTRFS_BLOCK_GROUP_DUP;
3313                 stripped = flags & ~stripped;
3314
3315                 /* turn raid0 into single device chunks */
3316                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
3317                         return stripped;
3318
3319                 /* turn mirroring into duplication */
3320                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3321                              BTRFS_BLOCK_GROUP_RAID10))
3322                         return stripped | BTRFS_BLOCK_GROUP_DUP;
3323                 return flags;
3324         } else {
3325                 /* they already had raid on here, just return */
3326                 if (flags & stripped)
3327                         return flags;
3328
3329                 stripped |= BTRFS_BLOCK_GROUP_DUP;
3330                 stripped = flags & ~stripped;
3331
3332                 /* switch duplicated blocks with raid1 */
3333                 if (flags & BTRFS_BLOCK_GROUP_DUP)
3334                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
3335
3336                 /* turn single device chunks into raid0 */
3337                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
3338         }
3339         return flags;
3340 }
3341
3342 int __alloc_chunk_for_shrink(struct btrfs_root *root,
3343                      struct btrfs_block_group_cache *shrink_block_group,
3344                      int force)
3345 {
3346         struct btrfs_trans_handle *trans;
3347         u64 new_alloc_flags;
3348         u64 calc;
3349
3350         spin_lock(&shrink_block_group->lock);
3351         if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
3352                 spin_unlock(&shrink_block_group->lock);
3353                 mutex_unlock(&root->fs_info->alloc_mutex);
3354
3355                 trans = btrfs_start_transaction(root, 1);
3356                 mutex_lock(&root->fs_info->alloc_mutex);
3357                 spin_lock(&shrink_block_group->lock);
3358
3359                 new_alloc_flags = update_block_group_flags(root,
3360                                                    shrink_block_group->flags);
3361                 if (new_alloc_flags != shrink_block_group->flags) {
3362                         calc =
3363                              btrfs_block_group_used(&shrink_block_group->item);
3364                 } else {
3365                         calc = shrink_block_group->key.offset;
3366                 }
3367                 spin_unlock(&shrink_block_group->lock);
3368
3369                 do_chunk_alloc(trans, root->fs_info->extent_root,
3370                                calc + 2 * 1024 * 1024, new_alloc_flags, force);
3371
3372                 mutex_unlock(&root->fs_info->alloc_mutex);
3373                 btrfs_end_transaction(trans, root);
3374                 mutex_lock(&root->fs_info->alloc_mutex);
3375         } else
3376                 spin_unlock(&shrink_block_group->lock);
3377         return 0;
3378 }
3379
3380 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
3381 {
3382         struct btrfs_trans_handle *trans;
3383         struct btrfs_root *tree_root = root->fs_info->tree_root;
3384         struct btrfs_path *path;
3385         u64 cur_byte;
3386         u64 total_found;
3387         u64 shrink_last_byte;
3388         struct btrfs_block_group_cache *shrink_block_group;
3389         struct btrfs_fs_info *info = root->fs_info;
3390         struct btrfs_key key;
3391         struct btrfs_key found_key;
3392         struct extent_buffer *leaf;
3393         u32 nritems;
3394         int ret;
3395         int progress;
3396
3397         mutex_lock(&root->fs_info->alloc_mutex);
3398         shrink_block_group = btrfs_lookup_block_group(root->fs_info,
3399                                                       shrink_start);
3400         BUG_ON(!shrink_block_group);
3401
3402         shrink_last_byte = shrink_block_group->key.objectid +
3403                 shrink_block_group->key.offset;
3404
3405         shrink_block_group->space_info->total_bytes -=
3406                 shrink_block_group->key.offset;
3407         path = btrfs_alloc_path();
3408         root = root->fs_info->extent_root;
3409         path->reada = 2;
3410
3411         printk("btrfs relocating block group %llu flags %llu\n",
3412                (unsigned long long)shrink_start,
3413                (unsigned long long)shrink_block_group->flags);
3414
3415         __alloc_chunk_for_shrink(root, shrink_block_group, 1);
3416
3417 again:
3418
3419         shrink_block_group->ro = 1;
3420
3421         total_found = 0;
3422         progress = 0;
3423         key.objectid = shrink_start;
3424         key.offset = 0;
3425         key.type = 0;
3426         cur_byte = key.objectid;
3427
3428         mutex_unlock(&root->fs_info->alloc_mutex);
3429
3430         btrfs_start_delalloc_inodes(root);
3431         btrfs_wait_ordered_extents(tree_root, 0);
3432
3433         mutex_lock(&root->fs_info->alloc_mutex);
3434
3435         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3436         if (ret < 0)
3437                 goto out;
3438
3439         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
3440         if (ret < 0)
3441                 goto out;
3442
3443         if (ret == 0) {
3444                 leaf = path->nodes[0];
3445                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3446                 if (found_key.objectid + found_key.offset > shrink_start &&
3447                     found_key.objectid < shrink_last_byte) {
3448                         cur_byte = found_key.objectid;
3449                         key.objectid = cur_byte;
3450                 }
3451         }
3452         btrfs_release_path(root, path);
3453
3454         while(1) {
3455                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3456                 if (ret < 0)
3457                         goto out;
3458
3459 next:
3460                 leaf = path->nodes[0];
3461                 nritems = btrfs_header_nritems(leaf);
3462                 if (path->slots[0] >= nritems) {
3463                         ret = btrfs_next_leaf(root, path);
3464                         if (ret < 0)
3465                                 goto out;
3466                         if (ret == 1) {
3467                                 ret = 0;
3468                                 break;
3469                         }
3470                         leaf = path->nodes[0];
3471                         nritems = btrfs_header_nritems(leaf);
3472                 }
3473
3474                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3475
3476                 if (found_key.objectid >= shrink_last_byte)
3477                         break;
3478
3479                 if (progress && need_resched()) {
3480                         memcpy(&key, &found_key, sizeof(key));
3481                         cond_resched();
3482                         btrfs_release_path(root, path);
3483                         btrfs_search_slot(NULL, root, &key, path, 0, 0);
3484                         progress = 0;
3485                         goto next;
3486                 }
3487                 progress = 1;
3488
3489                 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
3490                     found_key.objectid + found_key.offset <= cur_byte) {
3491                         memcpy(&key, &found_key, sizeof(key));
3492                         key.offset++;
3493                         path->slots[0]++;
3494                         goto next;
3495                 }
3496
3497                 total_found++;
3498                 cur_byte = found_key.objectid + found_key.offset;
3499                 key.objectid = cur_byte;
3500                 btrfs_release_path(root, path);
3501                 ret = relocate_one_extent(root, path, &found_key);
3502                 __alloc_chunk_for_shrink(root, shrink_block_group, 0);
3503         }
3504
3505         btrfs_release_path(root, path);
3506
3507         if (total_found > 0) {
3508                 printk("btrfs relocate found %llu last extent was %llu\n",
3509                        (unsigned long long)total_found,
3510                        (unsigned long long)found_key.objectid);
3511                 mutex_unlock(&root->fs_info->alloc_mutex);
3512                 trans = btrfs_start_transaction(tree_root, 1);
3513                 btrfs_commit_transaction(trans, tree_root);
3514
3515                 btrfs_clean_old_snapshots(tree_root);
3516
3517                 btrfs_start_delalloc_inodes(root);
3518                 btrfs_wait_ordered_extents(tree_root, 0);
3519
3520                 trans = btrfs_start_transaction(tree_root, 1);
3521                 btrfs_commit_transaction(trans, tree_root);
3522                 mutex_lock(&root->fs_info->alloc_mutex);
3523                 goto again;
3524         }
3525
3526         /*
3527          * we've freed all the extents, now remove the block
3528          * group item from the tree
3529          */
3530         mutex_unlock(&root->fs_info->alloc_mutex);
3531
3532         trans = btrfs_start_transaction(root, 1);
3533
3534         mutex_lock(&root->fs_info->alloc_mutex);
3535         memcpy(&key, &shrink_block_group->key, sizeof(key));
3536
3537         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3538         if (ret > 0)
3539                 ret = -EIO;
3540         if (ret < 0) {
3541                 btrfs_end_transaction(trans, root);
3542                 goto out;
3543         }
3544
3545         clear_extent_bits(&info->block_group_cache, key.objectid,
3546                           key.objectid + key.offset - 1,
3547                           (unsigned int)-1, GFP_NOFS);
3548
3549
3550         clear_extent_bits(&info->free_space_cache,
3551                            key.objectid, key.objectid + key.offset - 1,
3552                            (unsigned int)-1, GFP_NOFS);
3553
3554         /*
3555         memset(shrink_block_group, 0, sizeof(*shrink_block_group));
3556         kfree(shrink_block_group);
3557         */
3558
3559         btrfs_del_item(trans, root, path);
3560         btrfs_release_path(root, path);
3561         mutex_unlock(&root->fs_info->alloc_mutex);
3562         btrfs_commit_transaction(trans, root);
3563
3564         mutex_lock(&root->fs_info->alloc_mutex);
3565
3566         /* the code to unpin extents might set a few bits in the free
3567          * space cache for this range again
3568          */
3569         clear_extent_bits(&info->free_space_cache,
3570                            key.objectid, key.objectid + key.offset - 1,
3571                            (unsigned int)-1, GFP_NOFS);
3572 out:
3573         btrfs_free_path(path);
3574         mutex_unlock(&root->fs_info->alloc_mutex);
3575         return ret;
3576 }
3577
3578 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
3579                            struct btrfs_key *key)
3580 {
3581         int ret = 0;
3582         struct btrfs_key found_key;
3583         struct extent_buffer *leaf;
3584         int slot;
3585
3586         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
3587         if (ret < 0)
3588                 goto out;
3589
3590         while(1) {
3591                 slot = path->slots[0];
3592                 leaf = path->nodes[0];
3593                 if (slot >= btrfs_header_nritems(leaf)) {
3594                         ret = btrfs_next_leaf(root, path);
3595                         if (ret == 0)
3596                                 continue;
3597                         if (ret < 0)
3598                                 goto out;
3599                         break;
3600                 }
3601                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3602
3603                 if (found_key.objectid >= key->objectid &&
3604                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
3605                         ret = 0;
3606                         goto out;
3607                 }
3608                 path->slots[0]++;
3609         }
3610         ret = -ENOENT;
3611 out:
3612         return ret;
3613 }
3614
3615 int btrfs_read_block_groups(struct btrfs_root *root)
3616 {
3617         struct btrfs_path *path;
3618         int ret;
3619         int bit;
3620         struct btrfs_block_group_cache *cache;
3621         struct btrfs_fs_info *info = root->fs_info;
3622         struct btrfs_space_info *space_info;
3623         struct extent_io_tree *block_group_cache;
3624         struct btrfs_key key;
3625         struct btrfs_key found_key;
3626         struct extent_buffer *leaf;
3627
3628         block_group_cache = &info->block_group_cache;
3629         root = info->extent_root;
3630         key.objectid = 0;
3631         key.offset = 0;
3632         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3633         path = btrfs_alloc_path();
3634         if (!path)
3635                 return -ENOMEM;
3636
3637         mutex_lock(&root->fs_info->alloc_mutex);
3638         while(1) {
3639                 ret = find_first_block_group(root, path, &key);
3640                 if (ret > 0) {
3641                         ret = 0;
3642                         goto error;
3643                 }
3644                 if (ret != 0)
3645                         goto error;
3646
3647                 leaf = path->nodes[0];
3648                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3649                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
3650                 if (!cache) {
3651                         ret = -ENOMEM;
3652                         break;
3653                 }
3654
3655                 spin_lock_init(&cache->lock);
3656                 read_extent_buffer(leaf, &cache->item,
3657                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
3658                                    sizeof(cache->item));
3659                 memcpy(&cache->key, &found_key, sizeof(found_key));
3660
3661                 key.objectid = found_key.objectid + found_key.offset;
3662                 btrfs_release_path(root, path);
3663                 cache->flags = btrfs_block_group_flags(&cache->item);
3664                 bit = 0;
3665                 if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3666                         bit = BLOCK_GROUP_DATA;
3667                 } else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3668                         bit = BLOCK_GROUP_SYSTEM;
3669                 } else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
3670                         bit = BLOCK_GROUP_METADATA;
3671                 }
3672                 set_avail_alloc_bits(info, cache->flags);
3673
3674                 ret = update_space_info(info, cache->flags, found_key.offset,
3675                                         btrfs_block_group_used(&cache->item),
3676                                         &space_info);
3677                 BUG_ON(ret);
3678                 cache->space_info = space_info;
3679
3680                 /* use EXTENT_LOCKED to prevent merging */
3681                 set_extent_bits(block_group_cache, found_key.objectid,
3682                                 found_key.objectid + found_key.offset - 1,
3683                                 EXTENT_LOCKED, GFP_NOFS);
3684                 set_state_private(block_group_cache, found_key.objectid,
3685                                   (unsigned long)cache);
3686                 set_extent_bits(block_group_cache, found_key.objectid,
3687                                 found_key.objectid + found_key.offset - 1,
3688                                 bit | EXTENT_LOCKED, GFP_NOFS);
3689                 if (key.objectid >=
3690                     btrfs_super_total_bytes(&info->super_copy))
3691                         break;
3692         }
3693         ret = 0;
3694 error:
3695         btrfs_free_path(path);
3696         mutex_unlock(&root->fs_info->alloc_mutex);
3697         return ret;
3698 }
3699
3700 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3701                            struct btrfs_root *root, u64 bytes_used,
3702                            u64 type, u64 chunk_objectid, u64 chunk_offset,
3703                            u64 size)
3704 {
3705         int ret;
3706         int bit = 0;
3707         struct btrfs_root *extent_root;
3708         struct btrfs_block_group_cache *cache;
3709         struct extent_io_tree *block_group_cache;
3710
3711         WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
3712         extent_root = root->fs_info->extent_root;
3713         block_group_cache = &root->fs_info->block_group_cache;
3714
3715         root->fs_info->last_trans_new_blockgroup = trans->transid;
3716
3717         cache = kzalloc(sizeof(*cache), GFP_NOFS);
3718         BUG_ON(!cache);
3719         cache->key.objectid = chunk_offset;
3720         cache->key.offset = size;
3721         spin_lock_init(&cache->lock);
3722         btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3723
3724         btrfs_set_block_group_used(&cache->item, bytes_used);
3725         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
3726         cache->flags = type;
3727         btrfs_set_block_group_flags(&cache->item, type);
3728
3729         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
3730                                 &cache->space_info);
3731         BUG_ON(ret);
3732
3733         bit = block_group_state_bits(type);
3734         set_extent_bits(block_group_cache, chunk_offset,
3735                         chunk_offset + size - 1,
3736                         EXTENT_LOCKED, GFP_NOFS);
3737         set_state_private(block_group_cache, chunk_offset,
3738                           (unsigned long)cache);
3739         set_extent_bits(block_group_cache, chunk_offset,
3740                         chunk_offset + size - 1,
3741                         bit | EXTENT_LOCKED, GFP_NOFS);
3742
3743         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
3744                                 sizeof(cache->item));
3745         BUG_ON(ret);
3746
3747         finish_current_insert(trans, extent_root);
3748         ret = del_pending_extents(trans, extent_root);
3749         BUG_ON(ret);
3750         set_avail_alloc_bits(extent_root->fs_info, type);
3751
3752         return 0;
3753 }