Btrfs: Handle write errors on raid1 and raid10
[safe/jmp/linux-2.6] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include "hash.h"
22 #include "crc32c.h"
23 #include "ctree.h"
24 #include "disk-io.h"
25 #include "print-tree.h"
26 #include "transaction.h"
27 #include "volumes.h"
28
29 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
30 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
31 #define BLOCK_GROUP_SYSTEM   EXTENT_NEW
32
33 #define BLOCK_GROUP_DIRTY EXTENT_DIRTY
34
35 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
36                                  btrfs_root *extent_root);
37 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
38                                btrfs_root *extent_root);
39
40
41 static int cache_block_group(struct btrfs_root *root,
42                              struct btrfs_block_group_cache *block_group)
43 {
44         struct btrfs_path *path;
45         int ret;
46         struct btrfs_key key;
47         struct extent_buffer *leaf;
48         struct extent_io_tree *free_space_cache;
49         int slot;
50         u64 last = 0;
51         u64 hole_size;
52         u64 first_free;
53         int found = 0;
54
55         if (!block_group)
56                 return 0;
57
58         root = root->fs_info->extent_root;
59         free_space_cache = &root->fs_info->free_space_cache;
60
61         if (block_group->cached)
62                 return 0;
63
64         path = btrfs_alloc_path();
65         if (!path)
66                 return -ENOMEM;
67
68         path->reada = 2;
69         first_free = block_group->key.objectid;
70         key.objectid = block_group->key.objectid;
71         key.offset = 0;
72         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
73         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
74         if (ret < 0)
75                 return ret;
76         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
77         if (ret < 0)
78                 return ret;
79         if (ret == 0) {
80                 leaf = path->nodes[0];
81                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
82                 if (key.objectid + key.offset > first_free)
83                         first_free = key.objectid + key.offset;
84         }
85         while(1) {
86                 leaf = path->nodes[0];
87                 slot = path->slots[0];
88                 if (slot >= btrfs_header_nritems(leaf)) {
89                         ret = btrfs_next_leaf(root, path);
90                         if (ret < 0)
91                                 goto err;
92                         if (ret == 0) {
93                                 continue;
94                         } else {
95                                 break;
96                         }
97                 }
98                 btrfs_item_key_to_cpu(leaf, &key, slot);
99                 if (key.objectid < block_group->key.objectid) {
100                         goto next;
101                 }
102                 if (key.objectid >= block_group->key.objectid +
103                     block_group->key.offset) {
104                         break;
105                 }
106
107                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
108                         if (!found) {
109                                 last = first_free;
110                                 found = 1;
111                         }
112                         if (key.objectid > last) {
113                                 hole_size = key.objectid - last;
114                                 set_extent_dirty(free_space_cache, last,
115                                                  last + hole_size - 1,
116                                                  GFP_NOFS);
117                         }
118                         last = key.objectid + key.offset;
119                 }
120 next:
121                 path->slots[0]++;
122         }
123
124         if (!found)
125                 last = first_free;
126         if (block_group->key.objectid +
127             block_group->key.offset > last) {
128                 hole_size = block_group->key.objectid +
129                         block_group->key.offset - last;
130                 set_extent_dirty(free_space_cache, last,
131                                  last + hole_size - 1, GFP_NOFS);
132         }
133         block_group->cached = 1;
134 err:
135         btrfs_free_path(path);
136         return 0;
137 }
138
139 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
140                                                          btrfs_fs_info *info,
141                                                          u64 bytenr)
142 {
143         struct extent_io_tree *block_group_cache;
144         struct btrfs_block_group_cache *block_group = NULL;
145         u64 ptr;
146         u64 start;
147         u64 end;
148         int ret;
149
150         bytenr = max_t(u64, bytenr,
151                        BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
152         block_group_cache = &info->block_group_cache;
153         ret = find_first_extent_bit(block_group_cache,
154                                     bytenr, &start, &end,
155                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
156                                     BLOCK_GROUP_SYSTEM);
157         if (ret) {
158                 return NULL;
159         }
160         ret = get_state_private(block_group_cache, start, &ptr);
161         if (ret)
162                 return NULL;
163
164         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
165         if (block_group->key.objectid <= bytenr && bytenr <
166             block_group->key.objectid + block_group->key.offset)
167                 return block_group;
168         return NULL;
169 }
170
171 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
172 {
173         return (cache->flags & bits) == bits;
174 }
175
176 static int noinline find_search_start(struct btrfs_root *root,
177                               struct btrfs_block_group_cache **cache_ret,
178                               u64 *start_ret, int num, int data)
179 {
180         int ret;
181         struct btrfs_block_group_cache *cache = *cache_ret;
182         struct extent_io_tree *free_space_cache;
183         struct extent_state *state;
184         u64 last;
185         u64 start = 0;
186         u64 cache_miss = 0;
187         u64 total_fs_bytes;
188         u64 search_start = *start_ret;
189         int wrapped = 0;
190
191         if (!cache)
192                 goto out;
193
194         total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
195         free_space_cache = &root->fs_info->free_space_cache;
196
197 again:
198         ret = cache_block_group(root, cache);
199         if (ret)
200                 goto out;
201
202         last = max(search_start, cache->key.objectid);
203         if (!block_group_bits(cache, data) || cache->ro) {
204                 goto new_group;
205         }
206
207         spin_lock_irq(&free_space_cache->lock);
208         state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
209         while(1) {
210                 if (!state) {
211                         if (!cache_miss)
212                                 cache_miss = last;
213                         spin_unlock_irq(&free_space_cache->lock);
214                         goto new_group;
215                 }
216
217                 start = max(last, state->start);
218                 last = state->end + 1;
219                 if (last - start < num) {
220                         if (last == cache->key.objectid + cache->key.offset)
221                                 cache_miss = start;
222                         do {
223                                 state = extent_state_next(state);
224                         } while(state && !(state->state & EXTENT_DIRTY));
225                         continue;
226                 }
227                 spin_unlock_irq(&free_space_cache->lock);
228                 if (cache->ro)
229                         goto new_group;
230                 if (start + num > cache->key.objectid + cache->key.offset)
231                         goto new_group;
232                 if (start + num  > total_fs_bytes)
233                         goto new_group;
234                 if (!block_group_bits(cache, data)) {
235                         printk("block group bits don't match %Lu %d\n", cache->flags, data);
236                 }
237                 *start_ret = start;
238                 return 0;
239         }
240 out:
241         cache = btrfs_lookup_block_group(root->fs_info, search_start);
242         if (!cache) {
243                 printk("Unable to find block group for %Lu\n", search_start);
244                 WARN_ON(1);
245         }
246         return -ENOSPC;
247
248 new_group:
249         last = cache->key.objectid + cache->key.offset;
250 wrapped:
251         cache = btrfs_lookup_block_group(root->fs_info, last);
252         if (!cache || cache->key.objectid >= total_fs_bytes) {
253 no_cache:
254                 if (!wrapped) {
255                         wrapped = 1;
256                         last = search_start;
257                         goto wrapped;
258                 }
259                 goto out;
260         }
261         if (cache_miss && !cache->cached) {
262                 cache_block_group(root, cache);
263                 last = cache_miss;
264                 cache = btrfs_lookup_block_group(root->fs_info, last);
265         }
266         cache = btrfs_find_block_group(root, cache, last, data, 0);
267         if (!cache)
268                 goto no_cache;
269         *cache_ret = cache;
270         cache_miss = 0;
271         goto again;
272 }
273
274 static u64 div_factor(u64 num, int factor)
275 {
276         if (factor == 10)
277                 return num;
278         num *= factor;
279         do_div(num, 10);
280         return num;
281 }
282
283 static int block_group_state_bits(u64 flags)
284 {
285         int bits = 0;
286         if (flags & BTRFS_BLOCK_GROUP_DATA)
287                 bits |= BLOCK_GROUP_DATA;
288         if (flags & BTRFS_BLOCK_GROUP_METADATA)
289                 bits |= BLOCK_GROUP_METADATA;
290         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
291                 bits |= BLOCK_GROUP_SYSTEM;
292         return bits;
293 }
294
295 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
296                                                  struct btrfs_block_group_cache
297                                                  *hint, u64 search_start,
298                                                  int data, int owner)
299 {
300         struct btrfs_block_group_cache *cache;
301         struct extent_io_tree *block_group_cache;
302         struct btrfs_block_group_cache *found_group = NULL;
303         struct btrfs_fs_info *info = root->fs_info;
304         u64 used;
305         u64 last = 0;
306         u64 hint_last;
307         u64 start;
308         u64 end;
309         u64 free_check;
310         u64 ptr;
311         u64 total_fs_bytes;
312         int bit;
313         int ret;
314         int full_search = 0;
315         int factor = 10;
316
317         block_group_cache = &info->block_group_cache;
318         total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
319
320         if (data & BTRFS_BLOCK_GROUP_METADATA)
321                 factor = 9;
322
323         bit = block_group_state_bits(data);
324
325         if (search_start && search_start < total_fs_bytes) {
326                 struct btrfs_block_group_cache *shint;
327                 shint = btrfs_lookup_block_group(info, search_start);
328                 if (shint && block_group_bits(shint, data) && !shint->ro) {
329                         used = btrfs_block_group_used(&shint->item);
330                         if (used + shint->pinned <
331                             div_factor(shint->key.offset, factor)) {
332                                 return shint;
333                         }
334                 }
335         }
336         if (hint && !hint->ro && block_group_bits(hint, data) &&
337             hint->key.objectid < total_fs_bytes) {
338                 used = btrfs_block_group_used(&hint->item);
339                 if (used + hint->pinned <
340                     div_factor(hint->key.offset, factor)) {
341                         return hint;
342                 }
343                 last = hint->key.objectid + hint->key.offset;
344                 hint_last = last;
345         } else {
346                 if (hint)
347                         hint_last = max(hint->key.objectid, search_start);
348                 else
349                         hint_last = search_start;
350
351                 if (hint_last >= total_fs_bytes)
352                         hint_last = search_start;
353                 last = hint_last;
354         }
355 again:
356         while(1) {
357                 ret = find_first_extent_bit(block_group_cache, last,
358                                             &start, &end, bit);
359                 if (ret)
360                         break;
361
362                 ret = get_state_private(block_group_cache, start, &ptr);
363                 if (ret)
364                         break;
365
366                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
367                 last = cache->key.objectid + cache->key.offset;
368                 used = btrfs_block_group_used(&cache->item);
369
370                 if (cache->key.objectid > total_fs_bytes)
371                         break;
372
373                 if (!cache->ro && block_group_bits(cache, data)) {
374                         if (full_search)
375                                 free_check = cache->key.offset;
376                         else
377                                 free_check = div_factor(cache->key.offset,
378                                                         factor);
379
380                         if (used + cache->pinned < free_check) {
381                                 found_group = cache;
382                                 goto found;
383                         }
384                 }
385                 cond_resched();
386         }
387         if (!full_search) {
388                 last = search_start;
389                 full_search = 1;
390                 goto again;
391         }
392 found:
393         return found_group;
394 }
395
396 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
397                            u64 owner, u64 owner_offset)
398 {
399         u32 high_crc = ~(u32)0;
400         u32 low_crc = ~(u32)0;
401         __le64 lenum;
402         lenum = cpu_to_le64(root_objectid);
403         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
404         lenum = cpu_to_le64(ref_generation);
405         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
406         if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
407                 lenum = cpu_to_le64(owner);
408                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
409                 lenum = cpu_to_le64(owner_offset);
410                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
411         }
412         return ((u64)high_crc << 32) | (u64)low_crc;
413 }
414
415 static int match_extent_ref(struct extent_buffer *leaf,
416                             struct btrfs_extent_ref *disk_ref,
417                             struct btrfs_extent_ref *cpu_ref)
418 {
419         int ret;
420         int len;
421
422         if (cpu_ref->objectid)
423                 len = sizeof(*cpu_ref);
424         else
425                 len = 2 * sizeof(u64);
426         ret = memcmp_extent_buffer(leaf, cpu_ref, (unsigned long)disk_ref,
427                                    len);
428         return ret == 0;
429 }
430
431 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
432                                           struct btrfs_root *root,
433                                           struct btrfs_path *path, u64 bytenr,
434                                           u64 root_objectid,
435                                           u64 ref_generation, u64 owner,
436                                           u64 owner_offset, int del)
437 {
438         u64 hash;
439         struct btrfs_key key;
440         struct btrfs_key found_key;
441         struct btrfs_extent_ref ref;
442         struct extent_buffer *leaf;
443         struct btrfs_extent_ref *disk_ref;
444         int ret;
445         int ret2;
446
447         btrfs_set_stack_ref_root(&ref, root_objectid);
448         btrfs_set_stack_ref_generation(&ref, ref_generation);
449         btrfs_set_stack_ref_objectid(&ref, owner);
450         btrfs_set_stack_ref_offset(&ref, owner_offset);
451
452         hash = hash_extent_ref(root_objectid, ref_generation, owner,
453                                owner_offset);
454         key.offset = hash;
455         key.objectid = bytenr;
456         key.type = BTRFS_EXTENT_REF_KEY;
457
458         while (1) {
459                 ret = btrfs_search_slot(trans, root, &key, path,
460                                         del ? -1 : 0, del);
461                 if (ret < 0)
462                         goto out;
463                 leaf = path->nodes[0];
464                 if (ret != 0) {
465                         u32 nritems = btrfs_header_nritems(leaf);
466                         if (path->slots[0] >= nritems) {
467                                 ret2 = btrfs_next_leaf(root, path);
468                                 if (ret2)
469                                         goto out;
470                                 leaf = path->nodes[0];
471                         }
472                         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
473                         if (found_key.objectid != bytenr ||
474                             found_key.type != BTRFS_EXTENT_REF_KEY)
475                                 goto out;
476                         key.offset = found_key.offset;
477                         if (del) {
478                                 btrfs_release_path(root, path);
479                                 continue;
480                         }
481                 }
482                 disk_ref = btrfs_item_ptr(path->nodes[0],
483                                           path->slots[0],
484                                           struct btrfs_extent_ref);
485                 if (match_extent_ref(path->nodes[0], disk_ref, &ref)) {
486                         ret = 0;
487                         goto out;
488                 }
489                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
490                 key.offset = found_key.offset + 1;
491                 btrfs_release_path(root, path);
492         }
493 out:
494         return ret;
495 }
496
497 /*
498  * Back reference rules.  Back refs have three main goals:
499  *
500  * 1) differentiate between all holders of references to an extent so that
501  *    when a reference is dropped we can make sure it was a valid reference
502  *    before freeing the extent.
503  *
504  * 2) Provide enough information to quickly find the holders of an extent
505  *    if we notice a given block is corrupted or bad.
506  *
507  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
508  *    maintenance.  This is actually the same as #2, but with a slightly
509  *    different use case.
510  *
511  * File extents can be referenced by:
512  *
513  * - multiple snapshots, subvolumes, or different generations in one subvol
514  * - different files inside a single subvolume (in theory, not implemented yet)
515  * - different offsets inside a file (bookend extents in file.c)
516  *
517  * The extent ref structure has fields for:
518  *
519  * - Objectid of the subvolume root
520  * - Generation number of the tree holding the reference
521  * - objectid of the file holding the reference
522  * - offset in the file corresponding to the key holding the reference
523  *
524  * When a file extent is allocated the fields are filled in:
525  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
526  *
527  * When a leaf is cow'd new references are added for every file extent found
528  * in the leaf.  It looks the same as the create case, but trans->transid
529  * will be different when the block is cow'd.
530  *
531  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
532  *
533  * When a file extent is removed either during snapshot deletion or file
534  * truncation, the corresponding back reference is found
535  * by searching for:
536  *
537  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
538  *      inode objectid, offset in file)
539  *
540  * Btree extents can be referenced by:
541  *
542  * - Different subvolumes
543  * - Different generations of the same subvolume
544  *
545  * Storing sufficient information for a full reverse mapping of a btree
546  * block would require storing the lowest key of the block in the backref,
547  * and it would require updating that lowest key either before write out or
548  * every time it changed.  Instead, the objectid of the lowest key is stored
549  * along with the level of the tree block.  This provides a hint
550  * about where in the btree the block can be found.  Searches through the
551  * btree only need to look for a pointer to that block, so they stop one
552  * level higher than the level recorded in the backref.
553  *
554  * Some btrees do not do reference counting on their extents.  These
555  * include the extent tree and the tree of tree roots.  Backrefs for these
556  * trees always have a generation of zero.
557  *
558  * When a tree block is created, back references are inserted:
559  *
560  * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
561  *
562  * When a tree block is cow'd in a reference counted root,
563  * new back references are added for all the blocks it points to.
564  * These are of the form (trans->transid will have increased since creation):
565  *
566  * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
567  *
568  * Because the lowest_key_objectid and the level are just hints
569  * they are not used when backrefs are deleted.  When a backref is deleted:
570  *
571  * if backref was for a tree root:
572  *     root_objectid = root->root_key.objectid
573  * else
574  *     root_objectid = btrfs_header_owner(parent)
575  *
576  * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
577  *
578  * Back Reference Key hashing:
579  *
580  * Back references have four fields, each 64 bits long.  Unfortunately,
581  * This is hashed into a single 64 bit number and placed into the key offset.
582  * The key objectid corresponds to the first byte in the extent, and the
583  * key type is set to BTRFS_EXTENT_REF_KEY
584  */
585 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
586                                  struct btrfs_root *root,
587                                  struct btrfs_path *path, u64 bytenr,
588                                  u64 root_objectid, u64 ref_generation,
589                                  u64 owner, u64 owner_offset)
590 {
591         u64 hash;
592         struct btrfs_key key;
593         struct btrfs_extent_ref ref;
594         struct btrfs_extent_ref *disk_ref;
595         int ret;
596
597         btrfs_set_stack_ref_root(&ref, root_objectid);
598         btrfs_set_stack_ref_generation(&ref, ref_generation);
599         btrfs_set_stack_ref_objectid(&ref, owner);
600         btrfs_set_stack_ref_offset(&ref, owner_offset);
601
602         hash = hash_extent_ref(root_objectid, ref_generation, owner,
603                                owner_offset);
604         key.offset = hash;
605         key.objectid = bytenr;
606         key.type = BTRFS_EXTENT_REF_KEY;
607
608         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
609         while (ret == -EEXIST) {
610                 disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
611                                           struct btrfs_extent_ref);
612                 if (match_extent_ref(path->nodes[0], disk_ref, &ref))
613                         goto out;
614                 key.offset++;
615                 btrfs_release_path(root, path);
616                 ret = btrfs_insert_empty_item(trans, root, path, &key,
617                                               sizeof(ref));
618         }
619         if (ret)
620                 goto out;
621         disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
622                                   struct btrfs_extent_ref);
623         write_extent_buffer(path->nodes[0], &ref, (unsigned long)disk_ref,
624                             sizeof(ref));
625         btrfs_mark_buffer_dirty(path->nodes[0]);
626 out:
627         btrfs_release_path(root, path);
628         return ret;
629 }
630
631 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
632                                 struct btrfs_root *root,
633                                 u64 bytenr, u64 num_bytes,
634                                 u64 root_objectid, u64 ref_generation,
635                                 u64 owner, u64 owner_offset)
636 {
637         struct btrfs_path *path;
638         int ret;
639         struct btrfs_key key;
640         struct extent_buffer *l;
641         struct btrfs_extent_item *item;
642         u32 refs;
643
644         WARN_ON(num_bytes < root->sectorsize);
645         path = btrfs_alloc_path();
646         if (!path)
647                 return -ENOMEM;
648
649         path->reada = 1;
650         key.objectid = bytenr;
651         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
652         key.offset = num_bytes;
653         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
654                                 0, 1);
655         if (ret < 0)
656                 return ret;
657         if (ret != 0) {
658                 BUG();
659         }
660         BUG_ON(ret != 0);
661         l = path->nodes[0];
662         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
663         refs = btrfs_extent_refs(l, item);
664         btrfs_set_extent_refs(l, item, refs + 1);
665         btrfs_mark_buffer_dirty(path->nodes[0]);
666
667         btrfs_release_path(root->fs_info->extent_root, path);
668
669         path->reada = 1;
670         ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
671                                           path, bytenr, root_objectid,
672                                           ref_generation, owner, owner_offset);
673         BUG_ON(ret);
674         finish_current_insert(trans, root->fs_info->extent_root);
675         del_pending_extents(trans, root->fs_info->extent_root);
676
677         btrfs_free_path(path);
678         return 0;
679 }
680
681 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
682                          struct btrfs_root *root)
683 {
684         finish_current_insert(trans, root->fs_info->extent_root);
685         del_pending_extents(trans, root->fs_info->extent_root);
686         return 0;
687 }
688
689 static int lookup_extent_ref(struct btrfs_trans_handle *trans,
690                              struct btrfs_root *root, u64 bytenr,
691                              u64 num_bytes, u32 *refs)
692 {
693         struct btrfs_path *path;
694         int ret;
695         struct btrfs_key key;
696         struct extent_buffer *l;
697         struct btrfs_extent_item *item;
698
699         WARN_ON(num_bytes < root->sectorsize);
700         path = btrfs_alloc_path();
701         path->reada = 1;
702         key.objectid = bytenr;
703         key.offset = num_bytes;
704         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
705         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
706                                 0, 0);
707         if (ret < 0)
708                 goto out;
709         if (ret != 0) {
710                 btrfs_print_leaf(root, path->nodes[0]);
711                 printk("failed to find block number %Lu\n", bytenr);
712                 BUG();
713         }
714         l = path->nodes[0];
715         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
716         *refs = btrfs_extent_refs(l, item);
717 out:
718         btrfs_free_path(path);
719         return 0;
720 }
721
722 u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
723                                   struct btrfs_path *count_path,
724                                   u64 expected_owner,
725                                   u64 first_extent)
726 {
727         struct btrfs_root *extent_root = root->fs_info->extent_root;
728         struct btrfs_path *path;
729         u64 bytenr;
730         u64 found_objectid;
731         u64 found_owner;
732         u64 root_objectid = root->root_key.objectid;
733         u32 total_count = 0;
734         u32 extent_refs;
735         u32 cur_count;
736         u32 nritems;
737         int ret;
738         struct btrfs_key key;
739         struct btrfs_key found_key;
740         struct extent_buffer *l;
741         struct btrfs_extent_item *item;
742         struct btrfs_extent_ref *ref_item;
743         int level = -1;
744
745         path = btrfs_alloc_path();
746 again:
747         if (level == -1)
748                 bytenr = first_extent;
749         else
750                 bytenr = count_path->nodes[level]->start;
751
752         cur_count = 0;
753         key.objectid = bytenr;
754         key.offset = 0;
755
756         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
757         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
758         if (ret < 0)
759                 goto out;
760         BUG_ON(ret == 0);
761
762         l = path->nodes[0];
763         btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
764
765         if (found_key.objectid != bytenr ||
766             found_key.type != BTRFS_EXTENT_ITEM_KEY) {
767                 goto out;
768         }
769
770         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
771         extent_refs = btrfs_extent_refs(l, item);
772         while (1) {
773                 l = path->nodes[0];
774                 nritems = btrfs_header_nritems(l);
775                 if (path->slots[0] >= nritems) {
776                         ret = btrfs_next_leaf(extent_root, path);
777                         if (ret == 0)
778                                 continue;
779                         break;
780                 }
781                 btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
782                 if (found_key.objectid != bytenr)
783                         break;
784
785                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
786                         path->slots[0]++;
787                         continue;
788                 }
789
790                 cur_count++;
791                 ref_item = btrfs_item_ptr(l, path->slots[0],
792                                           struct btrfs_extent_ref);
793                 found_objectid = btrfs_ref_root(l, ref_item);
794
795                 if (found_objectid != root_objectid) {
796                         total_count = 2;
797                         goto out;
798                 }
799                 if (level == -1) {
800                         found_owner = btrfs_ref_objectid(l, ref_item);
801                         if (found_owner != expected_owner) {
802                                 total_count = 2;
803                                 goto out;
804                         }
805                         /*
806                          * nasty.  we don't count a reference held by
807                          * the running transaction.  This allows nodatacow
808                          * to avoid cow most of the time
809                          */
810                         if (found_owner >= BTRFS_FIRST_FREE_OBJECTID &&
811                             btrfs_ref_generation(l, ref_item) ==
812                             root->fs_info->generation) {
813                                 extent_refs--;
814                         }
815                 }
816                 total_count = 1;
817                 path->slots[0]++;
818         }
819         /*
820          * if there is more than one reference against a data extent,
821          * we have to assume the other ref is another snapshot
822          */
823         if (level == -1 && extent_refs > 1) {
824                 total_count = 2;
825                 goto out;
826         }
827         if (cur_count == 0) {
828                 total_count = 0;
829                 goto out;
830         }
831         if (level >= 0 && root->node == count_path->nodes[level])
832                 goto out;
833         level++;
834         btrfs_release_path(root, path);
835         goto again;
836
837 out:
838         btrfs_free_path(path);
839         return total_count;
840 }
841 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
842                        struct btrfs_root *root, u64 owner_objectid)
843 {
844         u64 generation;
845         u64 key_objectid;
846         u64 level;
847         u32 nritems;
848         struct btrfs_disk_key disk_key;
849
850         level = btrfs_header_level(root->node);
851         generation = trans->transid;
852         nritems = btrfs_header_nritems(root->node);
853         if (nritems > 0) {
854                 if (level == 0)
855                         btrfs_item_key(root->node, &disk_key, 0);
856                 else
857                         btrfs_node_key(root->node, &disk_key, 0);
858                 key_objectid = btrfs_disk_key_objectid(&disk_key);
859         } else {
860                 key_objectid = 0;
861         }
862         return btrfs_inc_extent_ref(trans, root, root->node->start,
863                                     root->node->len, owner_objectid,
864                                     generation, level, key_objectid);
865 }
866
867 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
868                   struct extent_buffer *buf)
869 {
870         u64 bytenr;
871         u32 nritems;
872         struct btrfs_key key;
873         struct btrfs_file_extent_item *fi;
874         int i;
875         int level;
876         int ret;
877         int faili;
878
879         if (!root->ref_cows)
880                 return 0;
881
882         level = btrfs_header_level(buf);
883         nritems = btrfs_header_nritems(buf);
884         for (i = 0; i < nritems; i++) {
885                 if (level == 0) {
886                         u64 disk_bytenr;
887                         btrfs_item_key_to_cpu(buf, &key, i);
888                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
889                                 continue;
890                         fi = btrfs_item_ptr(buf, i,
891                                             struct btrfs_file_extent_item);
892                         if (btrfs_file_extent_type(buf, fi) ==
893                             BTRFS_FILE_EXTENT_INLINE)
894                                 continue;
895                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
896                         if (disk_bytenr == 0)
897                                 continue;
898                         ret = btrfs_inc_extent_ref(trans, root, disk_bytenr,
899                                     btrfs_file_extent_disk_num_bytes(buf, fi),
900                                     root->root_key.objectid, trans->transid,
901                                     key.objectid, key.offset);
902                         if (ret) {
903                                 faili = i;
904                                 goto fail;
905                         }
906                 } else {
907                         bytenr = btrfs_node_blockptr(buf, i);
908                         btrfs_node_key_to_cpu(buf, &key, i);
909                         ret = btrfs_inc_extent_ref(trans, root, bytenr,
910                                            btrfs_level_size(root, level - 1),
911                                            root->root_key.objectid,
912                                            trans->transid,
913                                            level - 1, key.objectid);
914                         if (ret) {
915                                 faili = i;
916                                 goto fail;
917                         }
918                 }
919         }
920         return 0;
921 fail:
922         WARN_ON(1);
923 #if 0
924         for (i =0; i < faili; i++) {
925                 if (level == 0) {
926                         u64 disk_bytenr;
927                         btrfs_item_key_to_cpu(buf, &key, i);
928                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
929                                 continue;
930                         fi = btrfs_item_ptr(buf, i,
931                                             struct btrfs_file_extent_item);
932                         if (btrfs_file_extent_type(buf, fi) ==
933                             BTRFS_FILE_EXTENT_INLINE)
934                                 continue;
935                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
936                         if (disk_bytenr == 0)
937                                 continue;
938                         err = btrfs_free_extent(trans, root, disk_bytenr,
939                                     btrfs_file_extent_disk_num_bytes(buf,
940                                                                       fi), 0);
941                         BUG_ON(err);
942                 } else {
943                         bytenr = btrfs_node_blockptr(buf, i);
944                         err = btrfs_free_extent(trans, root, bytenr,
945                                         btrfs_level_size(root, level - 1), 0);
946                         BUG_ON(err);
947                 }
948         }
949 #endif
950         return ret;
951 }
952
953 static int write_one_cache_group(struct btrfs_trans_handle *trans,
954                                  struct btrfs_root *root,
955                                  struct btrfs_path *path,
956                                  struct btrfs_block_group_cache *cache)
957 {
958         int ret;
959         int pending_ret;
960         struct btrfs_root *extent_root = root->fs_info->extent_root;
961         unsigned long bi;
962         struct extent_buffer *leaf;
963
964         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
965         if (ret < 0)
966                 goto fail;
967         BUG_ON(ret);
968
969         leaf = path->nodes[0];
970         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
971         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
972         btrfs_mark_buffer_dirty(leaf);
973         btrfs_release_path(extent_root, path);
974 fail:
975         finish_current_insert(trans, extent_root);
976         pending_ret = del_pending_extents(trans, extent_root);
977         if (ret)
978                 return ret;
979         if (pending_ret)
980                 return pending_ret;
981         return 0;
982
983 }
984
985 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
986                                    struct btrfs_root *root)
987 {
988         struct extent_io_tree *block_group_cache;
989         struct btrfs_block_group_cache *cache;
990         int ret;
991         int err = 0;
992         int werr = 0;
993         struct btrfs_path *path;
994         u64 last = 0;
995         u64 start;
996         u64 end;
997         u64 ptr;
998
999         block_group_cache = &root->fs_info->block_group_cache;
1000         path = btrfs_alloc_path();
1001         if (!path)
1002                 return -ENOMEM;
1003
1004         while(1) {
1005                 ret = find_first_extent_bit(block_group_cache, last,
1006                                             &start, &end, BLOCK_GROUP_DIRTY);
1007                 if (ret)
1008                         break;
1009
1010                 last = end + 1;
1011                 ret = get_state_private(block_group_cache, start, &ptr);
1012                 if (ret)
1013                         break;
1014                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
1015                 err = write_one_cache_group(trans, root,
1016                                             path, cache);
1017                 /*
1018                  * if we fail to write the cache group, we want
1019                  * to keep it marked dirty in hopes that a later
1020                  * write will work
1021                  */
1022                 if (err) {
1023                         werr = err;
1024                         continue;
1025                 }
1026                 clear_extent_bits(block_group_cache, start, end,
1027                                   BLOCK_GROUP_DIRTY, GFP_NOFS);
1028         }
1029         btrfs_free_path(path);
1030         return werr;
1031 }
1032
1033 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
1034                                                   u64 flags)
1035 {
1036         struct list_head *head = &info->space_info;
1037         struct list_head *cur;
1038         struct btrfs_space_info *found;
1039         list_for_each(cur, head) {
1040                 found = list_entry(cur, struct btrfs_space_info, list);
1041                 if (found->flags == flags)
1042                         return found;
1043         }
1044         return NULL;
1045
1046 }
1047
1048 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1049                              u64 total_bytes, u64 bytes_used,
1050                              struct btrfs_space_info **space_info)
1051 {
1052         struct btrfs_space_info *found;
1053
1054         found = __find_space_info(info, flags);
1055         if (found) {
1056                 found->total_bytes += total_bytes;
1057                 found->bytes_used += bytes_used;
1058                 found->full = 0;
1059                 WARN_ON(found->total_bytes < found->bytes_used);
1060                 *space_info = found;
1061                 return 0;
1062         }
1063         found = kmalloc(sizeof(*found), GFP_NOFS);
1064         if (!found)
1065                 return -ENOMEM;
1066
1067         list_add(&found->list, &info->space_info);
1068         found->flags = flags;
1069         found->total_bytes = total_bytes;
1070         found->bytes_used = bytes_used;
1071         found->bytes_pinned = 0;
1072         found->full = 0;
1073         *space_info = found;
1074         return 0;
1075 }
1076
1077 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1078 {
1079         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1080                                    BTRFS_BLOCK_GROUP_RAID1 |
1081                                    BTRFS_BLOCK_GROUP_RAID10 |
1082                                    BTRFS_BLOCK_GROUP_DUP);
1083         if (extra_flags) {
1084                 if (flags & BTRFS_BLOCK_GROUP_DATA)
1085                         fs_info->avail_data_alloc_bits |= extra_flags;
1086                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1087                         fs_info->avail_metadata_alloc_bits |= extra_flags;
1088                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1089                         fs_info->avail_system_alloc_bits |= extra_flags;
1090         }
1091 }
1092
1093 static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1094 {
1095         u64 num_devices = root->fs_info->fs_devices->num_devices;
1096
1097         if (num_devices == 1)
1098                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1099         if (num_devices < 4)
1100                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1101
1102         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1103             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1104                       BTRFS_BLOCK_GROUP_RAID10))) {
1105                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1106         }
1107
1108         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1109             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1110                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1111         }
1112
1113         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1114             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1115              (flags & BTRFS_BLOCK_GROUP_RAID10) |
1116              (flags & BTRFS_BLOCK_GROUP_DUP)))
1117                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1118         return flags;
1119 }
1120
1121 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1122                           struct btrfs_root *extent_root, u64 alloc_bytes,
1123                           u64 flags)
1124 {
1125         struct btrfs_space_info *space_info;
1126         u64 thresh;
1127         u64 start;
1128         u64 num_bytes;
1129         int ret;
1130
1131         flags = reduce_alloc_profile(extent_root, flags);
1132
1133         space_info = __find_space_info(extent_root->fs_info, flags);
1134         if (!space_info) {
1135                 ret = update_space_info(extent_root->fs_info, flags,
1136                                         0, 0, &space_info);
1137                 BUG_ON(ret);
1138         }
1139         BUG_ON(!space_info);
1140
1141         if (space_info->full)
1142                 return 0;
1143
1144         thresh = div_factor(space_info->total_bytes, 6);
1145         if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
1146             thresh)
1147                 return 0;
1148
1149         ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
1150         if (ret == -ENOSPC) {
1151 printk("space info full %Lu\n", flags);
1152                 space_info->full = 1;
1153                 return 0;
1154         }
1155
1156         BUG_ON(ret);
1157
1158         ret = btrfs_make_block_group(trans, extent_root, 0, flags,
1159                      BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
1160         BUG_ON(ret);
1161
1162         return 0;
1163 }
1164
1165 static int update_block_group(struct btrfs_trans_handle *trans,
1166                               struct btrfs_root *root,
1167                               u64 bytenr, u64 num_bytes, int alloc,
1168                               int mark_free)
1169 {
1170         struct btrfs_block_group_cache *cache;
1171         struct btrfs_fs_info *info = root->fs_info;
1172         u64 total = num_bytes;
1173         u64 old_val;
1174         u64 byte_in_group;
1175         u64 start;
1176         u64 end;
1177
1178         while(total) {
1179                 cache = btrfs_lookup_block_group(info, bytenr);
1180                 if (!cache) {
1181                         return -1;
1182                 }
1183                 byte_in_group = bytenr - cache->key.objectid;
1184                 WARN_ON(byte_in_group > cache->key.offset);
1185                 start = cache->key.objectid;
1186                 end = start + cache->key.offset - 1;
1187                 set_extent_bits(&info->block_group_cache, start, end,
1188                                 BLOCK_GROUP_DIRTY, GFP_NOFS);
1189
1190                 old_val = btrfs_block_group_used(&cache->item);
1191                 num_bytes = min(total, cache->key.offset - byte_in_group);
1192                 if (alloc) {
1193                         old_val += num_bytes;
1194                         cache->space_info->bytes_used += num_bytes;
1195                 } else {
1196                         old_val -= num_bytes;
1197                         cache->space_info->bytes_used -= num_bytes;
1198                         if (mark_free) {
1199                                 set_extent_dirty(&info->free_space_cache,
1200                                                  bytenr, bytenr + num_bytes - 1,
1201                                                  GFP_NOFS);
1202                         }
1203                 }
1204                 btrfs_set_block_group_used(&cache->item, old_val);
1205                 total -= num_bytes;
1206                 bytenr += num_bytes;
1207         }
1208         return 0;
1209 }
1210
1211 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1212 {
1213         u64 start;
1214         u64 end;
1215         int ret;
1216         ret = find_first_extent_bit(&root->fs_info->block_group_cache,
1217                                     search_start, &start, &end,
1218                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
1219                                     BLOCK_GROUP_SYSTEM);
1220         if (ret)
1221                 return 0;
1222         return start;
1223 }
1224
1225
1226 static int update_pinned_extents(struct btrfs_root *root,
1227                                 u64 bytenr, u64 num, int pin)
1228 {
1229         u64 len;
1230         struct btrfs_block_group_cache *cache;
1231         struct btrfs_fs_info *fs_info = root->fs_info;
1232
1233         if (pin) {
1234                 set_extent_dirty(&fs_info->pinned_extents,
1235                                 bytenr, bytenr + num - 1, GFP_NOFS);
1236         } else {
1237                 clear_extent_dirty(&fs_info->pinned_extents,
1238                                 bytenr, bytenr + num - 1, GFP_NOFS);
1239         }
1240         while (num > 0) {
1241                 cache = btrfs_lookup_block_group(fs_info, bytenr);
1242                 if (!cache) {
1243                         u64 first = first_logical_byte(root, bytenr);
1244                         WARN_ON(first < bytenr);
1245                         len = min(first - bytenr, num);
1246                 } else {
1247                         len = min(num, cache->key.offset -
1248                                   (bytenr - cache->key.objectid));
1249                 }
1250                 if (pin) {
1251                         if (cache) {
1252                                 cache->pinned += len;
1253                                 cache->space_info->bytes_pinned += len;
1254                         }
1255                         fs_info->total_pinned += len;
1256                 } else {
1257                         if (cache) {
1258                                 cache->pinned -= len;
1259                                 cache->space_info->bytes_pinned -= len;
1260                         }
1261                         fs_info->total_pinned -= len;
1262                 }
1263                 bytenr += len;
1264                 num -= len;
1265         }
1266         return 0;
1267 }
1268
1269 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
1270 {
1271         u64 last = 0;
1272         u64 start;
1273         u64 end;
1274         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
1275         int ret;
1276
1277         while(1) {
1278                 ret = find_first_extent_bit(pinned_extents, last,
1279                                             &start, &end, EXTENT_DIRTY);
1280                 if (ret)
1281                         break;
1282                 set_extent_dirty(copy, start, end, GFP_NOFS);
1283                 last = end + 1;
1284         }
1285         return 0;
1286 }
1287
1288 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1289                                struct btrfs_root *root,
1290                                struct extent_io_tree *unpin)
1291 {
1292         u64 start;
1293         u64 end;
1294         int ret;
1295         struct extent_io_tree *free_space_cache;
1296         free_space_cache = &root->fs_info->free_space_cache;
1297
1298         while(1) {
1299                 ret = find_first_extent_bit(unpin, 0, &start, &end,
1300                                             EXTENT_DIRTY);
1301                 if (ret)
1302                         break;
1303                 update_pinned_extents(root, start, end + 1 - start, 0);
1304                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
1305                 set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
1306         }
1307         return 0;
1308 }
1309
1310 static int finish_current_insert(struct btrfs_trans_handle *trans,
1311                                  struct btrfs_root *extent_root)
1312 {
1313         u64 start;
1314         u64 end;
1315         struct btrfs_fs_info *info = extent_root->fs_info;
1316         struct extent_buffer *eb;
1317         struct btrfs_path *path;
1318         struct btrfs_key ins;
1319         struct btrfs_disk_key first;
1320         struct btrfs_extent_item extent_item;
1321         int ret;
1322         int level;
1323         int err = 0;
1324
1325         btrfs_set_stack_extent_refs(&extent_item, 1);
1326         btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
1327         path = btrfs_alloc_path();
1328
1329         while(1) {
1330                 ret = find_first_extent_bit(&info->extent_ins, 0, &start,
1331                                             &end, EXTENT_LOCKED);
1332                 if (ret)
1333                         break;
1334
1335                 ins.objectid = start;
1336                 ins.offset = end + 1 - start;
1337                 err = btrfs_insert_item(trans, extent_root, &ins,
1338                                         &extent_item, sizeof(extent_item));
1339                 clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
1340                                   GFP_NOFS);
1341                 eb = read_tree_block(extent_root, ins.objectid, ins.offset,
1342                                      trans->transid);
1343                 level = btrfs_header_level(eb);
1344                 if (level == 0) {
1345                         btrfs_item_key(eb, &first, 0);
1346                 } else {
1347                         btrfs_node_key(eb, &first, 0);
1348                 }
1349                 err = btrfs_insert_extent_backref(trans, extent_root, path,
1350                                           start, extent_root->root_key.objectid,
1351                                           0, level,
1352                                           btrfs_disk_key_objectid(&first));
1353                 BUG_ON(err);
1354                 free_extent_buffer(eb);
1355         }
1356         btrfs_free_path(path);
1357         return 0;
1358 }
1359
1360 static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
1361                           int pending)
1362 {
1363         int err = 0;
1364         struct extent_buffer *buf;
1365
1366         if (!pending) {
1367                 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
1368                 if (buf) {
1369                         if (btrfs_buffer_uptodate(buf, 0)) {
1370                                 u64 transid =
1371                                     root->fs_info->running_transaction->transid;
1372                                 u64 header_transid =
1373                                         btrfs_header_generation(buf);
1374                                 if (header_transid == transid &&
1375                                     !btrfs_header_flag(buf,
1376                                                BTRFS_HEADER_FLAG_WRITTEN)) {
1377                                         clean_tree_block(NULL, root, buf);
1378                                         free_extent_buffer(buf);
1379                                         return 1;
1380                                 }
1381                         }
1382                         free_extent_buffer(buf);
1383                 }
1384                 update_pinned_extents(root, bytenr, num_bytes, 1);
1385         } else {
1386                 set_extent_bits(&root->fs_info->pending_del,
1387                                 bytenr, bytenr + num_bytes - 1,
1388                                 EXTENT_LOCKED, GFP_NOFS);
1389         }
1390         BUG_ON(err < 0);
1391         return 0;
1392 }
1393
1394 /*
1395  * remove an extent from the root, returns 0 on success
1396  */
1397 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1398                          *root, u64 bytenr, u64 num_bytes,
1399                          u64 root_objectid, u64 ref_generation,
1400                          u64 owner_objectid, u64 owner_offset, int pin,
1401                          int mark_free)
1402 {
1403         struct btrfs_path *path;
1404         struct btrfs_key key;
1405         struct btrfs_fs_info *info = root->fs_info;
1406         struct btrfs_root *extent_root = info->extent_root;
1407         struct extent_buffer *leaf;
1408         int ret;
1409         int extent_slot = 0;
1410         int found_extent = 0;
1411         int num_to_del = 1;
1412         struct btrfs_extent_item *ei;
1413         u32 refs;
1414
1415         key.objectid = bytenr;
1416         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1417         key.offset = num_bytes;
1418         path = btrfs_alloc_path();
1419         if (!path)
1420                 return -ENOMEM;
1421
1422         path->reada = 1;
1423         ret = lookup_extent_backref(trans, extent_root, path,
1424                                     bytenr, root_objectid,
1425                                     ref_generation,
1426                                     owner_objectid, owner_offset, 1);
1427         if (ret == 0) {
1428                 struct btrfs_key found_key;
1429                 extent_slot = path->slots[0];
1430                 while(extent_slot > 0) {
1431                         extent_slot--;
1432                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1433                                               extent_slot);
1434                         if (found_key.objectid != bytenr)
1435                                 break;
1436                         if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
1437                             found_key.offset == num_bytes) {
1438                                 found_extent = 1;
1439                                 break;
1440                         }
1441                         if (path->slots[0] - extent_slot > 5)
1442                                 break;
1443                 }
1444                 if (!found_extent)
1445                         ret = btrfs_del_item(trans, extent_root, path);
1446         } else {
1447                 btrfs_print_leaf(extent_root, path->nodes[0]);
1448                 WARN_ON(1);
1449                 printk("Unable to find ref byte nr %Lu root %Lu "
1450                        " gen %Lu owner %Lu offset %Lu\n", bytenr,
1451                        root_objectid, ref_generation, owner_objectid,
1452                        owner_offset);
1453         }
1454         if (!found_extent) {
1455                 btrfs_release_path(extent_root, path);
1456                 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1457                 if (ret < 0)
1458                         return ret;
1459                 BUG_ON(ret);
1460                 extent_slot = path->slots[0];
1461         }
1462
1463         leaf = path->nodes[0];
1464         ei = btrfs_item_ptr(leaf, extent_slot,
1465                             struct btrfs_extent_item);
1466         refs = btrfs_extent_refs(leaf, ei);
1467         BUG_ON(refs == 0);
1468         refs -= 1;
1469         btrfs_set_extent_refs(leaf, ei, refs);
1470
1471         btrfs_mark_buffer_dirty(leaf);
1472
1473         if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
1474                 /* if the back ref and the extent are next to each other
1475                  * they get deleted below in one shot
1476                  */
1477                 path->slots[0] = extent_slot;
1478                 num_to_del = 2;
1479         } else if (found_extent) {
1480                 /* otherwise delete the extent back ref */
1481                 ret = btrfs_del_item(trans, extent_root, path);
1482                 BUG_ON(ret);
1483                 /* if refs are 0, we need to setup the path for deletion */
1484                 if (refs == 0) {
1485                         btrfs_release_path(extent_root, path);
1486                         ret = btrfs_search_slot(trans, extent_root, &key, path,
1487                                                 -1, 1);
1488                         if (ret < 0)
1489                                 return ret;
1490                         BUG_ON(ret);
1491                 }
1492         }
1493
1494         if (refs == 0) {
1495                 u64 super_used;
1496                 u64 root_used;
1497
1498                 if (pin) {
1499                         ret = pin_down_bytes(root, bytenr, num_bytes, 0);
1500                         if (ret > 0)
1501                                 mark_free = 1;
1502                         BUG_ON(ret < 0);
1503                 }
1504
1505                 /* block accounting for super block */
1506                 super_used = btrfs_super_bytes_used(&info->super_copy);
1507                 btrfs_set_super_bytes_used(&info->super_copy,
1508                                            super_used - num_bytes);
1509
1510                 /* block accounting for root item */
1511                 root_used = btrfs_root_used(&root->root_item);
1512                 btrfs_set_root_used(&root->root_item,
1513                                            root_used - num_bytes);
1514                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
1515                                       num_to_del);
1516                 if (ret) {
1517                         return ret;
1518                 }
1519                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
1520                                          mark_free);
1521                 BUG_ON(ret);
1522         }
1523         btrfs_free_path(path);
1524         finish_current_insert(trans, extent_root);
1525         return ret;
1526 }
1527
1528 /*
1529  * find all the blocks marked as pending in the radix tree and remove
1530  * them from the extent map
1531  */
1532 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
1533                                btrfs_root *extent_root)
1534 {
1535         int ret;
1536         int err = 0;
1537         u64 start;
1538         u64 end;
1539         struct extent_io_tree *pending_del;
1540         struct extent_io_tree *pinned_extents;
1541
1542         pending_del = &extent_root->fs_info->pending_del;
1543         pinned_extents = &extent_root->fs_info->pinned_extents;
1544
1545         while(1) {
1546                 ret = find_first_extent_bit(pending_del, 0, &start, &end,
1547                                             EXTENT_LOCKED);
1548                 if (ret)
1549                         break;
1550                 update_pinned_extents(extent_root, start, end + 1 - start, 1);
1551                 clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
1552                                   GFP_NOFS);
1553                 ret = __free_extent(trans, extent_root,
1554                                      start, end + 1 - start,
1555                                      extent_root->root_key.objectid,
1556                                      0, 0, 0, 0, 0);
1557                 if (ret)
1558                         err = ret;
1559         }
1560         return err;
1561 }
1562
1563 /*
1564  * remove an extent from the root, returns 0 on success
1565  */
1566 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1567                       *root, u64 bytenr, u64 num_bytes,
1568                       u64 root_objectid, u64 ref_generation,
1569                       u64 owner_objectid, u64 owner_offset, int pin)
1570 {
1571         struct btrfs_root *extent_root = root->fs_info->extent_root;
1572         int pending_ret;
1573         int ret;
1574
1575         WARN_ON(num_bytes < root->sectorsize);
1576         if (!root->ref_cows)
1577                 ref_generation = 0;
1578
1579         if (root == extent_root) {
1580                 pin_down_bytes(root, bytenr, num_bytes, 1);
1581                 return 0;
1582         }
1583         ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
1584                             ref_generation, owner_objectid, owner_offset,
1585                             pin, pin == 0);
1586         pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
1587         return ret ? ret : pending_ret;
1588 }
1589
1590 static u64 stripe_align(struct btrfs_root *root, u64 val)
1591 {
1592         u64 mask = ((u64)root->stripesize - 1);
1593         u64 ret = (val + mask) & ~mask;
1594         return ret;
1595 }
1596
1597 /*
1598  * walks the btree of allocated extents and find a hole of a given size.
1599  * The key ins is changed to record the hole:
1600  * ins->objectid == block start
1601  * ins->flags = BTRFS_EXTENT_ITEM_KEY
1602  * ins->offset == number of blocks
1603  * Any available blocks before search_start are skipped.
1604  */
1605 static int noinline find_free_extent(struct btrfs_trans_handle *trans,
1606                                      struct btrfs_root *orig_root,
1607                                      u64 num_bytes, u64 empty_size,
1608                                      u64 search_start, u64 search_end,
1609                                      u64 hint_byte, struct btrfs_key *ins,
1610                                      u64 exclude_start, u64 exclude_nr,
1611                                      int data)
1612 {
1613         int ret;
1614         u64 orig_search_start;
1615         struct btrfs_root * root = orig_root->fs_info->extent_root;
1616         struct btrfs_fs_info *info = root->fs_info;
1617         u64 total_needed = num_bytes;
1618         u64 *last_ptr = NULL;
1619         struct btrfs_block_group_cache *block_group;
1620         int full_scan = 0;
1621         int wrapped = 0;
1622         int empty_cluster = 2 * 1024 * 1024;
1623
1624         WARN_ON(num_bytes < root->sectorsize);
1625         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
1626
1627         if (data & BTRFS_BLOCK_GROUP_METADATA) {
1628                 last_ptr = &root->fs_info->last_alloc;
1629                 empty_cluster = 256 * 1024;
1630         }
1631
1632         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
1633                 last_ptr = &root->fs_info->last_data_alloc;
1634         }
1635
1636         if (last_ptr) {
1637                 if (*last_ptr)
1638                         hint_byte = *last_ptr;
1639                 else {
1640                         empty_size += empty_cluster;
1641                 }
1642         }
1643
1644         search_start = max(search_start, first_logical_byte(root, 0));
1645         orig_search_start = search_start;
1646
1647         if (search_end == (u64)-1)
1648                 search_end = btrfs_super_total_bytes(&info->super_copy);
1649
1650         if (hint_byte) {
1651                 block_group = btrfs_lookup_block_group(info, hint_byte);
1652                 if (!block_group)
1653                         hint_byte = search_start;
1654                 block_group = btrfs_find_block_group(root, block_group,
1655                                                      hint_byte, data, 1);
1656                 if (last_ptr && *last_ptr == 0 && block_group)
1657                         hint_byte = block_group->key.objectid;
1658         } else {
1659                 block_group = btrfs_find_block_group(root,
1660                                                      trans->block_group,
1661                                                      search_start, data, 1);
1662         }
1663         search_start = max(search_start, hint_byte);
1664
1665         total_needed += empty_size;
1666
1667 check_failed:
1668         if (!block_group) {
1669                 block_group = btrfs_lookup_block_group(info, search_start);
1670                 if (!block_group)
1671                         block_group = btrfs_lookup_block_group(info,
1672                                                        orig_search_start);
1673         }
1674         ret = find_search_start(root, &block_group, &search_start,
1675                                 total_needed, data);
1676         if (ret == -ENOSPC && last_ptr && *last_ptr) {
1677                 *last_ptr = 0;
1678                 block_group = btrfs_lookup_block_group(info,
1679                                                        orig_search_start);
1680                 search_start = orig_search_start;
1681                 ret = find_search_start(root, &block_group, &search_start,
1682                                         total_needed, data);
1683         }
1684         if (ret == -ENOSPC)
1685                 goto enospc;
1686         if (ret)
1687                 goto error;
1688
1689         if (last_ptr && *last_ptr && search_start != *last_ptr) {
1690                 *last_ptr = 0;
1691                 if (!empty_size) {
1692                         empty_size += empty_cluster;
1693                         total_needed += empty_size;
1694                 }
1695                 block_group = btrfs_lookup_block_group(info,
1696                                                        orig_search_start);
1697                 search_start = orig_search_start;
1698                 ret = find_search_start(root, &block_group,
1699                                         &search_start, total_needed, data);
1700                 if (ret == -ENOSPC)
1701                         goto enospc;
1702                 if (ret)
1703                         goto error;
1704         }
1705
1706         search_start = stripe_align(root, search_start);
1707         ins->objectid = search_start;
1708         ins->offset = num_bytes;
1709
1710         if (ins->objectid + num_bytes >= search_end)
1711                 goto enospc;
1712
1713         if (ins->objectid + num_bytes >
1714             block_group->key.objectid + block_group->key.offset) {
1715                 search_start = block_group->key.objectid +
1716                         block_group->key.offset;
1717                 goto new_group;
1718         }
1719
1720         if (test_range_bit(&info->extent_ins, ins->objectid,
1721                            ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
1722                 search_start = ins->objectid + num_bytes;
1723                 goto new_group;
1724         }
1725
1726         if (test_range_bit(&info->pinned_extents, ins->objectid,
1727                            ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
1728                 search_start = ins->objectid + num_bytes;
1729                 goto new_group;
1730         }
1731
1732         if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
1733             ins->objectid < exclude_start + exclude_nr)) {
1734                 search_start = exclude_start + exclude_nr;
1735                 goto new_group;
1736         }
1737
1738         if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
1739                 block_group = btrfs_lookup_block_group(info, ins->objectid);
1740                 if (block_group)
1741                         trans->block_group = block_group;
1742         }
1743         ins->offset = num_bytes;
1744         if (last_ptr) {
1745                 *last_ptr = ins->objectid + ins->offset;
1746                 if (*last_ptr ==
1747                     btrfs_super_total_bytes(&root->fs_info->super_copy)) {
1748                         *last_ptr = 0;
1749                 }
1750         }
1751         return 0;
1752
1753 new_group:
1754         if (search_start + num_bytes >= search_end) {
1755 enospc:
1756                 search_start = orig_search_start;
1757                 if (full_scan) {
1758                         ret = -ENOSPC;
1759                         goto error;
1760                 }
1761                 if (wrapped) {
1762                         if (!full_scan)
1763                                 total_needed -= empty_size;
1764                         full_scan = 1;
1765                 } else
1766                         wrapped = 1;
1767         }
1768         block_group = btrfs_lookup_block_group(info, search_start);
1769         cond_resched();
1770         block_group = btrfs_find_block_group(root, block_group,
1771                                              search_start, data, 0);
1772         goto check_failed;
1773
1774 error:
1775         return ret;
1776 }
1777
1778 /*
1779  * finds a free extent and does all the dirty work required for allocation
1780  * returns the key for the extent through ins, and a tree buffer for
1781  * the first block of the extent through buf.
1782  *
1783  * returns 0 if everything worked, non-zero otherwise.
1784  */
1785 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1786                        struct btrfs_root *root,
1787                        u64 num_bytes, u64 min_alloc_size,
1788                        u64 root_objectid, u64 ref_generation,
1789                        u64 owner, u64 owner_offset,
1790                        u64 empty_size, u64 hint_byte,
1791                        u64 search_end, struct btrfs_key *ins, u64 data)
1792 {
1793         int ret;
1794         int pending_ret;
1795         u64 super_used;
1796         u64 root_used;
1797         u64 search_start = 0;
1798         u64 alloc_profile;
1799         u32 sizes[2];
1800         struct btrfs_fs_info *info = root->fs_info;
1801         struct btrfs_root *extent_root = info->extent_root;
1802         struct btrfs_extent_item *extent_item;
1803         struct btrfs_extent_ref *ref;
1804         struct btrfs_path *path;
1805         struct btrfs_key keys[2];
1806
1807         if (data) {
1808                 alloc_profile = info->avail_data_alloc_bits &
1809                                 info->data_alloc_profile;
1810                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1811         } else if (root == root->fs_info->chunk_root) {
1812                 alloc_profile = info->avail_system_alloc_bits &
1813                                 info->system_alloc_profile;
1814                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1815         } else {
1816                 alloc_profile = info->avail_metadata_alloc_bits &
1817                                 info->metadata_alloc_profile;
1818                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1819         }
1820 again:
1821         data = reduce_alloc_profile(root, data);
1822         if (root->ref_cows) {
1823                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
1824                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1825                                              2 * 1024 * 1024,
1826                                              BTRFS_BLOCK_GROUP_METADATA |
1827                                              (info->metadata_alloc_profile &
1828                                               info->avail_metadata_alloc_bits));
1829                         BUG_ON(ret);
1830                 }
1831                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1832                                      num_bytes + 2 * 1024 * 1024, data);
1833                 BUG_ON(ret);
1834         }
1835
1836         WARN_ON(num_bytes < root->sectorsize);
1837         ret = find_free_extent(trans, root, num_bytes, empty_size,
1838                                search_start, search_end, hint_byte, ins,
1839                                trans->alloc_exclude_start,
1840                                trans->alloc_exclude_nr, data);
1841
1842         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
1843                 num_bytes = num_bytes >> 1;
1844                 num_bytes = max(num_bytes, min_alloc_size);
1845                 goto again;
1846         }
1847         if (ret) {
1848                 printk("allocation failed flags %Lu\n", data);
1849         }
1850         BUG_ON(ret);
1851         if (ret)
1852                 return ret;
1853
1854         /* block accounting for super block */
1855         super_used = btrfs_super_bytes_used(&info->super_copy);
1856         btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
1857
1858         /* block accounting for root item */
1859         root_used = btrfs_root_used(&root->root_item);
1860         btrfs_set_root_used(&root->root_item, root_used + num_bytes);
1861
1862         clear_extent_dirty(&root->fs_info->free_space_cache,
1863                            ins->objectid, ins->objectid + ins->offset - 1,
1864                            GFP_NOFS);
1865
1866         if (root == extent_root) {
1867                 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
1868                                 ins->objectid + ins->offset - 1,
1869                                 EXTENT_LOCKED, GFP_NOFS);
1870                 goto update_block;
1871         }
1872
1873         WARN_ON(trans->alloc_exclude_nr);
1874         trans->alloc_exclude_start = ins->objectid;
1875         trans->alloc_exclude_nr = ins->offset;
1876
1877         memcpy(&keys[0], ins, sizeof(*ins));
1878         keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
1879                                          owner, owner_offset);
1880         keys[1].objectid = ins->objectid;
1881         keys[1].type = BTRFS_EXTENT_REF_KEY;
1882         sizes[0] = sizeof(*extent_item);
1883         sizes[1] = sizeof(*ref);
1884
1885         path = btrfs_alloc_path();
1886         BUG_ON(!path);
1887
1888         ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
1889                                        sizes, 2);
1890
1891         BUG_ON(ret);
1892         extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1893                                      struct btrfs_extent_item);
1894         btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
1895         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
1896                              struct btrfs_extent_ref);
1897
1898         btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
1899         btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
1900         btrfs_set_ref_objectid(path->nodes[0], ref, owner);
1901         btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
1902
1903         btrfs_mark_buffer_dirty(path->nodes[0]);
1904
1905         trans->alloc_exclude_start = 0;
1906         trans->alloc_exclude_nr = 0;
1907         btrfs_free_path(path);
1908         finish_current_insert(trans, extent_root);
1909         pending_ret = del_pending_extents(trans, extent_root);
1910
1911         if (ret) {
1912                 return ret;
1913         }
1914         if (pending_ret) {
1915                 return pending_ret;
1916         }
1917
1918 update_block:
1919         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
1920         if (ret) {
1921                 printk("update block group failed for %Lu %Lu\n",
1922                        ins->objectid, ins->offset);
1923                 BUG();
1924         }
1925         return 0;
1926 }
1927
1928 /*
1929  * helper function to allocate a block for a given tree
1930  * returns the tree buffer or NULL.
1931  */
1932 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1933                                              struct btrfs_root *root,
1934                                              u32 blocksize,
1935                                              u64 root_objectid, u64 hint,
1936                                              u64 empty_size)
1937 {
1938         u64 ref_generation;
1939
1940         if (root->ref_cows)
1941                 ref_generation = trans->transid;
1942         else
1943                 ref_generation = 0;
1944
1945
1946         return __btrfs_alloc_free_block(trans, root, blocksize, root_objectid,
1947                                         ref_generation, 0, 0, hint, empty_size);
1948 }
1949
1950 /*
1951  * helper function to allocate a block for a given tree
1952  * returns the tree buffer or NULL.
1953  */
1954 struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1955                                              struct btrfs_root *root,
1956                                              u32 blocksize,
1957                                              u64 root_objectid,
1958                                              u64 ref_generation,
1959                                              u64 first_objectid,
1960                                              int level,
1961                                              u64 hint,
1962                                              u64 empty_size)
1963 {
1964         struct btrfs_key ins;
1965         int ret;
1966         struct extent_buffer *buf;
1967
1968         ret = btrfs_alloc_extent(trans, root, blocksize, blocksize,
1969                                  root_objectid, ref_generation,
1970                                  level, first_objectid, empty_size, hint,
1971                                  (u64)-1, &ins, 0);
1972         if (ret) {
1973                 BUG_ON(ret > 0);
1974                 return ERR_PTR(ret);
1975         }
1976         buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
1977         if (!buf) {
1978                 btrfs_free_extent(trans, root, ins.objectid, blocksize,
1979                                   root->root_key.objectid, ref_generation,
1980                                   0, 0, 0);
1981                 return ERR_PTR(-ENOMEM);
1982         }
1983         btrfs_set_header_generation(buf, trans->transid);
1984         clean_tree_block(trans, root, buf);
1985         btrfs_set_buffer_uptodate(buf);
1986
1987         if (PageDirty(buf->first_page)) {
1988                 printk("page %lu dirty\n", buf->first_page->index);
1989                 WARN_ON(1);
1990         }
1991
1992         set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
1993                          buf->start + buf->len - 1, GFP_NOFS);
1994         if (!btrfs_test_opt(root, SSD))
1995                 btrfs_set_buffer_defrag(buf);
1996         trans->blocks_used++;
1997         return buf;
1998 }
1999
2000 static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
2001                                   struct btrfs_root *root,
2002                                   struct extent_buffer *leaf)
2003 {
2004         u64 leaf_owner;
2005         u64 leaf_generation;
2006         struct btrfs_key key;
2007         struct btrfs_file_extent_item *fi;
2008         int i;
2009         int nritems;
2010         int ret;
2011
2012         BUG_ON(!btrfs_is_leaf(leaf));
2013         nritems = btrfs_header_nritems(leaf);
2014         leaf_owner = btrfs_header_owner(leaf);
2015         leaf_generation = btrfs_header_generation(leaf);
2016
2017         for (i = 0; i < nritems; i++) {
2018                 u64 disk_bytenr;
2019
2020                 btrfs_item_key_to_cpu(leaf, &key, i);
2021                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2022                         continue;
2023                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2024                 if (btrfs_file_extent_type(leaf, fi) ==
2025                     BTRFS_FILE_EXTENT_INLINE)
2026                         continue;
2027                 /*
2028                  * FIXME make sure to insert a trans record that
2029                  * repeats the snapshot del on crash
2030                  */
2031                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2032                 if (disk_bytenr == 0)
2033                         continue;
2034                 ret = btrfs_free_extent(trans, root, disk_bytenr,
2035                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
2036                                 leaf_owner, leaf_generation,
2037                                 key.objectid, key.offset, 0);
2038                 BUG_ON(ret);
2039         }
2040         return 0;
2041 }
2042
2043 static void noinline reada_walk_down(struct btrfs_root *root,
2044                                      struct extent_buffer *node,
2045                                      int slot)
2046 {
2047         u64 bytenr;
2048         u64 last = 0;
2049         u32 nritems;
2050         u32 refs;
2051         u32 blocksize;
2052         int ret;
2053         int i;
2054         int level;
2055         int skipped = 0;
2056
2057         nritems = btrfs_header_nritems(node);
2058         level = btrfs_header_level(node);
2059         if (level)
2060                 return;
2061
2062         for (i = slot; i < nritems && skipped < 32; i++) {
2063                 bytenr = btrfs_node_blockptr(node, i);
2064                 if (last && ((bytenr > last && bytenr - last > 32 * 1024) ||
2065                              (last > bytenr && last - bytenr > 32 * 1024))) {
2066                         skipped++;
2067                         continue;
2068                 }
2069                 blocksize = btrfs_level_size(root, level - 1);
2070                 if (i != slot) {
2071                         ret = lookup_extent_ref(NULL, root, bytenr,
2072                                                 blocksize, &refs);
2073                         BUG_ON(ret);
2074                         if (refs != 1) {
2075                                 skipped++;
2076                                 continue;
2077                         }
2078                 }
2079                 mutex_unlock(&root->fs_info->fs_mutex);
2080                 ret = readahead_tree_block(root, bytenr, blocksize,
2081                                            btrfs_node_ptr_generation(node, i));
2082                 last = bytenr + blocksize;
2083                 cond_resched();
2084                 mutex_lock(&root->fs_info->fs_mutex);
2085                 if (ret)
2086                         break;
2087         }
2088 }
2089
2090 /*
2091  * helper function for drop_snapshot, this walks down the tree dropping ref
2092  * counts as it goes.
2093  */
2094 static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
2095                                    struct btrfs_root *root,
2096                                    struct btrfs_path *path, int *level)
2097 {
2098         u64 root_owner;
2099         u64 root_gen;
2100         u64 bytenr;
2101         u64 ptr_gen;
2102         struct extent_buffer *next;
2103         struct extent_buffer *cur;
2104         struct extent_buffer *parent;
2105         u32 blocksize;
2106         int ret;
2107         u32 refs;
2108
2109         WARN_ON(*level < 0);
2110         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2111         ret = lookup_extent_ref(trans, root,
2112                                 path->nodes[*level]->start,
2113                                 path->nodes[*level]->len, &refs);
2114         BUG_ON(ret);
2115         if (refs > 1)
2116                 goto out;
2117
2118         /*
2119          * walk down to the last node level and free all the leaves
2120          */
2121         while(*level >= 0) {
2122                 WARN_ON(*level < 0);
2123                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2124                 cur = path->nodes[*level];
2125
2126                 if (btrfs_header_level(cur) != *level)
2127                         WARN_ON(1);
2128
2129                 if (path->slots[*level] >=
2130                     btrfs_header_nritems(cur))
2131                         break;
2132                 if (*level == 0) {
2133                         ret = drop_leaf_ref(trans, root, cur);
2134                         BUG_ON(ret);
2135                         break;
2136                 }
2137                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2138                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2139                 blocksize = btrfs_level_size(root, *level - 1);
2140                 ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
2141                 BUG_ON(ret);
2142                 if (refs != 1) {
2143                         parent = path->nodes[*level];
2144                         root_owner = btrfs_header_owner(parent);
2145                         root_gen = btrfs_header_generation(parent);
2146                         path->slots[*level]++;
2147                         ret = btrfs_free_extent(trans, root, bytenr,
2148                                                 blocksize, root_owner,
2149                                                 root_gen, 0, 0, 1);
2150                         BUG_ON(ret);
2151                         continue;
2152                 }
2153                 next = btrfs_find_tree_block(root, bytenr, blocksize);
2154                 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
2155                         free_extent_buffer(next);
2156                         reada_walk_down(root, cur, path->slots[*level]);
2157
2158                         mutex_unlock(&root->fs_info->fs_mutex);
2159                         next = read_tree_block(root, bytenr, blocksize,
2160                                                ptr_gen);
2161                         mutex_lock(&root->fs_info->fs_mutex);
2162
2163                         /* we've dropped the lock, double check */
2164                         ret = lookup_extent_ref(trans, root, bytenr,
2165                                                 blocksize, &refs);
2166                         BUG_ON(ret);
2167                         if (refs != 1) {
2168                                 parent = path->nodes[*level];
2169                                 root_owner = btrfs_header_owner(parent);
2170                                 root_gen = btrfs_header_generation(parent);
2171
2172                                 path->slots[*level]++;
2173                                 free_extent_buffer(next);
2174                                 ret = btrfs_free_extent(trans, root, bytenr,
2175                                                         blocksize,
2176                                                         root_owner,
2177                                                         root_gen, 0, 0, 1);
2178                                 BUG_ON(ret);
2179                                 continue;
2180                         }
2181                 }
2182                 WARN_ON(*level <= 0);
2183                 if (path->nodes[*level-1])
2184                         free_extent_buffer(path->nodes[*level-1]);
2185                 path->nodes[*level-1] = next;
2186                 *level = btrfs_header_level(next);
2187                 path->slots[*level] = 0;
2188         }
2189 out:
2190         WARN_ON(*level < 0);
2191         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2192
2193         if (path->nodes[*level] == root->node) {
2194                 root_owner = root->root_key.objectid;
2195                 parent = path->nodes[*level];
2196         } else {
2197                 parent = path->nodes[*level + 1];
2198                 root_owner = btrfs_header_owner(parent);
2199         }
2200
2201         root_gen = btrfs_header_generation(parent);
2202         ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
2203                                 path->nodes[*level]->len,
2204                                 root_owner, root_gen, 0, 0, 1);
2205         free_extent_buffer(path->nodes[*level]);
2206         path->nodes[*level] = NULL;
2207         *level += 1;
2208         BUG_ON(ret);
2209         return 0;
2210 }
2211
2212 /*
2213  * helper for dropping snapshots.  This walks back up the tree in the path
2214  * to find the first node higher up where we haven't yet gone through
2215  * all the slots
2216  */
2217 static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
2218                                  struct btrfs_root *root,
2219                                  struct btrfs_path *path, int *level)
2220 {
2221         u64 root_owner;
2222         u64 root_gen;
2223         struct btrfs_root_item *root_item = &root->root_item;
2224         int i;
2225         int slot;
2226         int ret;
2227
2228         for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2229                 slot = path->slots[i];
2230                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
2231                         struct extent_buffer *node;
2232                         struct btrfs_disk_key disk_key;
2233                         node = path->nodes[i];
2234                         path->slots[i]++;
2235                         *level = i;
2236                         WARN_ON(*level == 0);
2237                         btrfs_node_key(node, &disk_key, path->slots[i]);
2238                         memcpy(&root_item->drop_progress,
2239                                &disk_key, sizeof(disk_key));
2240                         root_item->drop_level = i;
2241                         return 0;
2242                 } else {
2243                         if (path->nodes[*level] == root->node) {
2244                                 root_owner = root->root_key.objectid;
2245                                 root_gen =
2246                                    btrfs_header_generation(path->nodes[*level]);
2247                         } else {
2248                                 struct extent_buffer *node;
2249                                 node = path->nodes[*level + 1];
2250                                 root_owner = btrfs_header_owner(node);
2251                                 root_gen = btrfs_header_generation(node);
2252                         }
2253                         ret = btrfs_free_extent(trans, root,
2254                                                 path->nodes[*level]->start,
2255                                                 path->nodes[*level]->len,
2256                                                 root_owner, root_gen, 0, 0, 1);
2257                         BUG_ON(ret);
2258                         free_extent_buffer(path->nodes[*level]);
2259                         path->nodes[*level] = NULL;
2260                         *level = i + 1;
2261                 }
2262         }
2263         return 1;
2264 }
2265
2266 /*
2267  * drop the reference count on the tree rooted at 'snap'.  This traverses
2268  * the tree freeing any blocks that have a ref count of zero after being
2269  * decremented.
2270  */
2271 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
2272                         *root)
2273 {
2274         int ret = 0;
2275         int wret;
2276         int level;
2277         struct btrfs_path *path;
2278         int i;
2279         int orig_level;
2280         struct btrfs_root_item *root_item = &root->root_item;
2281
2282         path = btrfs_alloc_path();
2283         BUG_ON(!path);
2284
2285         level = btrfs_header_level(root->node);
2286         orig_level = level;
2287         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2288                 path->nodes[level] = root->node;
2289                 extent_buffer_get(root->node);
2290                 path->slots[level] = 0;
2291         } else {
2292                 struct btrfs_key key;
2293                 struct btrfs_disk_key found_key;
2294                 struct extent_buffer *node;
2295
2296                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2297                 level = root_item->drop_level;
2298                 path->lowest_level = level;
2299                 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2300                 if (wret < 0) {
2301                         ret = wret;
2302                         goto out;
2303                 }
2304                 node = path->nodes[level];
2305                 btrfs_node_key(node, &found_key, path->slots[level]);
2306                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
2307                                sizeof(found_key)));
2308         }
2309         while(1) {
2310                 wret = walk_down_tree(trans, root, path, &level);
2311                 if (wret > 0)
2312                         break;
2313                 if (wret < 0)
2314                         ret = wret;
2315
2316                 wret = walk_up_tree(trans, root, path, &level);
2317                 if (wret > 0)
2318                         break;
2319                 if (wret < 0)
2320                         ret = wret;
2321                 ret = -EAGAIN;
2322                 break;
2323         }
2324         for (i = 0; i <= orig_level; i++) {
2325                 if (path->nodes[i]) {
2326                         free_extent_buffer(path->nodes[i]);
2327                         path->nodes[i] = NULL;
2328                 }
2329         }
2330 out:
2331         btrfs_free_path(path);
2332         return ret;
2333 }
2334
2335 int btrfs_free_block_groups(struct btrfs_fs_info *info)
2336 {
2337         u64 start;
2338         u64 end;
2339         u64 ptr;
2340         int ret;
2341         while(1) {
2342                 ret = find_first_extent_bit(&info->block_group_cache, 0,
2343                                             &start, &end, (unsigned int)-1);
2344                 if (ret)
2345                         break;
2346                 ret = get_state_private(&info->block_group_cache, start, &ptr);
2347                 if (!ret)
2348                         kfree((void *)(unsigned long)ptr);
2349                 clear_extent_bits(&info->block_group_cache, start,
2350                                   end, (unsigned int)-1, GFP_NOFS);
2351         }
2352         while(1) {
2353                 ret = find_first_extent_bit(&info->free_space_cache, 0,
2354                                             &start, &end, EXTENT_DIRTY);
2355                 if (ret)
2356                         break;
2357                 clear_extent_dirty(&info->free_space_cache, start,
2358                                    end, GFP_NOFS);
2359         }
2360         return 0;
2361 }
2362
2363 static unsigned long calc_ra(unsigned long start, unsigned long last,
2364                              unsigned long nr)
2365 {
2366         return min(last, start + nr - 1);
2367 }
2368
2369 static int noinline relocate_inode_pages(struct inode *inode, u64 start,
2370                                          u64 len)
2371 {
2372         u64 page_start;
2373         u64 page_end;
2374         unsigned long last_index;
2375         unsigned long i;
2376         struct page *page;
2377         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2378         struct file_ra_state *ra;
2379         unsigned long total_read = 0;
2380         unsigned long ra_pages;
2381         struct btrfs_trans_handle *trans;
2382
2383         ra = kzalloc(sizeof(*ra), GFP_NOFS);
2384
2385         mutex_lock(&inode->i_mutex);
2386         i = start >> PAGE_CACHE_SHIFT;
2387         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2388
2389         ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
2390
2391         file_ra_state_init(ra, inode->i_mapping);
2392
2393         for (; i <= last_index; i++) {
2394                 if (total_read % ra_pages == 0) {
2395                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2396                                        calc_ra(i, last_index, ra_pages));
2397                 }
2398                 total_read++;
2399                 if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
2400                         goto truncate_racing;
2401
2402                 page = grab_cache_page(inode->i_mapping, i);
2403                 if (!page) {
2404                         goto out_unlock;
2405                 }
2406                 if (!PageUptodate(page)) {
2407                         btrfs_readpage(NULL, page);
2408                         lock_page(page);
2409                         if (!PageUptodate(page)) {
2410                                 unlock_page(page);
2411                                 page_cache_release(page);
2412                                 goto out_unlock;
2413                         }
2414                 }
2415 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
2416                 ClearPageDirty(page);
2417 #else
2418                 cancel_dirty_page(page, PAGE_CACHE_SIZE);
2419 #endif
2420                 wait_on_page_writeback(page);
2421                 set_page_extent_mapped(page);
2422                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2423                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2424
2425                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2426
2427                 set_extent_delalloc(io_tree, page_start,
2428                                     page_end, GFP_NOFS);
2429                 set_page_dirty(page);
2430
2431                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2432                 unlock_page(page);
2433                 page_cache_release(page);
2434         }
2435         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2436                                            total_read);
2437
2438 out_unlock:
2439         kfree(ra);
2440         trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
2441         if (trans) {
2442                 btrfs_add_ordered_inode(inode);
2443                 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
2444                 mark_inode_dirty(inode);
2445         }
2446         mutex_unlock(&inode->i_mutex);
2447         return 0;
2448
2449 truncate_racing:
2450         vmtruncate(inode, inode->i_size);
2451         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2452                                            total_read);
2453         goto out_unlock;
2454 }
2455
2456 /*
2457  * The back references tell us which tree holds a ref on a block,
2458  * but it is possible for the tree root field in the reference to
2459  * reflect the original root before a snapshot was made.  In this
2460  * case we should search through all the children of a given root
2461  * to find potential holders of references on a block.
2462  *
2463  * Instead, we do something a little less fancy and just search
2464  * all the roots for a given key/block combination.
2465  */
2466 static int find_root_for_ref(struct btrfs_root *root,
2467                              struct btrfs_path *path,
2468                              struct btrfs_key *key0,
2469                              int level,
2470                              int file_key,
2471                              struct btrfs_root **found_root,
2472                              u64 bytenr)
2473 {
2474         struct btrfs_key root_location;
2475         struct btrfs_root *cur_root = *found_root;
2476         struct btrfs_file_extent_item *file_extent;
2477         u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
2478         u64 found_bytenr;
2479         int ret;
2480         int i;
2481
2482         root_location.offset = (u64)-1;
2483         root_location.type = BTRFS_ROOT_ITEM_KEY;
2484         path->lowest_level = level;
2485         path->reada = 0;
2486         while(1) {
2487                 ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
2488                 found_bytenr = 0;
2489                 if (ret == 0 && file_key) {
2490                         struct extent_buffer *leaf = path->nodes[0];
2491                         file_extent = btrfs_item_ptr(leaf, path->slots[0],
2492                                              struct btrfs_file_extent_item);
2493                         if (btrfs_file_extent_type(leaf, file_extent) ==
2494                             BTRFS_FILE_EXTENT_REG) {
2495                                 found_bytenr =
2496                                         btrfs_file_extent_disk_bytenr(leaf,
2497                                                                file_extent);
2498                        }
2499                 } else if (!file_key) {
2500                         if (path->nodes[level])
2501                                 found_bytenr = path->nodes[level]->start;
2502                 }
2503
2504                 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2505                         if (!path->nodes[i])
2506                                 break;
2507                         free_extent_buffer(path->nodes[i]);
2508                         path->nodes[i] = NULL;
2509                 }
2510                 btrfs_release_path(cur_root, path);
2511
2512                 if (found_bytenr == bytenr) {
2513                         *found_root = cur_root;
2514                         ret = 0;
2515                         goto out;
2516                 }
2517                 ret = btrfs_search_root(root->fs_info->tree_root,
2518                                         root_search_start, &root_search_start);
2519                 if (ret)
2520                         break;
2521
2522                 root_location.objectid = root_search_start;
2523                 cur_root = btrfs_read_fs_root_no_name(root->fs_info,
2524                                                       &root_location);
2525                 if (!cur_root) {
2526                         ret = 1;
2527                         break;
2528                 }
2529         }
2530 out:
2531         path->lowest_level = 0;
2532         return ret;
2533 }
2534
2535 /*
2536  * note, this releases the path
2537  */
2538 static int noinline relocate_one_reference(struct btrfs_root *extent_root,
2539                                   struct btrfs_path *path,
2540                                   struct btrfs_key *extent_key)
2541 {
2542         struct inode *inode;
2543         struct btrfs_root *found_root;
2544         struct btrfs_key root_location;
2545         struct btrfs_key found_key;
2546         struct btrfs_extent_ref *ref;
2547         u64 ref_root;
2548         u64 ref_gen;
2549         u64 ref_objectid;
2550         u64 ref_offset;
2551         int ret;
2552         int level;
2553
2554         ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
2555                              struct btrfs_extent_ref);
2556         ref_root = btrfs_ref_root(path->nodes[0], ref);
2557         ref_gen = btrfs_ref_generation(path->nodes[0], ref);
2558         ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
2559         ref_offset = btrfs_ref_offset(path->nodes[0], ref);
2560         btrfs_release_path(extent_root, path);
2561
2562         root_location.objectid = ref_root;
2563         if (ref_gen == 0)
2564                 root_location.offset = 0;
2565         else
2566                 root_location.offset = (u64)-1;
2567         root_location.type = BTRFS_ROOT_ITEM_KEY;
2568
2569         found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
2570                                                 &root_location);
2571         BUG_ON(!found_root);
2572
2573         if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2574                 found_key.objectid = ref_objectid;
2575                 found_key.type = BTRFS_EXTENT_DATA_KEY;
2576                 found_key.offset = ref_offset;
2577                 level = 0;
2578
2579                 ret = find_root_for_ref(extent_root, path, &found_key,
2580                                         level, 1, &found_root,
2581                                         extent_key->objectid);
2582
2583                 if (ret)
2584                         goto out;
2585
2586                 mutex_unlock(&extent_root->fs_info->fs_mutex);
2587                 inode = btrfs_iget_locked(extent_root->fs_info->sb,
2588                                           ref_objectid, found_root);
2589                 if (inode->i_state & I_NEW) {
2590                         /* the inode and parent dir are two different roots */
2591                         BTRFS_I(inode)->root = found_root;
2592                         BTRFS_I(inode)->location.objectid = ref_objectid;
2593                         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
2594                         BTRFS_I(inode)->location.offset = 0;
2595                         btrfs_read_locked_inode(inode);
2596                         unlock_new_inode(inode);
2597
2598                 }
2599                 /* this can happen if the reference is not against
2600                  * the latest version of the tree root
2601                  */
2602                 if (is_bad_inode(inode)) {
2603                         mutex_lock(&extent_root->fs_info->fs_mutex);
2604                         goto out;
2605                 }
2606                 relocate_inode_pages(inode, ref_offset, extent_key->offset);
2607                 iput(inode);
2608                 mutex_lock(&extent_root->fs_info->fs_mutex);
2609         } else {
2610                 struct btrfs_trans_handle *trans;
2611                 struct extent_buffer *eb;
2612                 int i;
2613
2614                 eb = read_tree_block(found_root, extent_key->objectid,
2615                                      extent_key->offset, 0);
2616                 level = btrfs_header_level(eb);
2617
2618                 if (level == 0)
2619                         btrfs_item_key_to_cpu(eb, &found_key, 0);
2620                 else
2621                         btrfs_node_key_to_cpu(eb, &found_key, 0);
2622
2623                 free_extent_buffer(eb);
2624
2625                 ret = find_root_for_ref(extent_root, path, &found_key,
2626                                         level, 0, &found_root,
2627                                         extent_key->objectid);
2628
2629                 if (ret)
2630                         goto out;
2631
2632                 trans = btrfs_start_transaction(found_root, 1);
2633
2634                 path->lowest_level = level;
2635                 path->reada = 2;
2636                 ret = btrfs_search_slot(trans, found_root, &found_key, path,
2637                                         0, 1);
2638                 path->lowest_level = 0;
2639                 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2640                         if (!path->nodes[i])
2641                                 break;
2642                         free_extent_buffer(path->nodes[i]);
2643                         path->nodes[i] = NULL;
2644                 }
2645                 btrfs_release_path(found_root, path);
2646                 btrfs_end_transaction(trans, found_root);
2647         }
2648
2649 out:
2650         return 0;
2651 }
2652
2653 static int noinline del_extent_zero(struct btrfs_root *extent_root,
2654                                     struct btrfs_path *path,
2655                                     struct btrfs_key *extent_key)
2656 {
2657         int ret;
2658         struct btrfs_trans_handle *trans;
2659
2660         trans = btrfs_start_transaction(extent_root, 1);
2661         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
2662         if (ret > 0) {
2663                 ret = -EIO;
2664                 goto out;
2665         }
2666         if (ret < 0)
2667                 goto out;
2668         ret = btrfs_del_item(trans, extent_root, path);
2669 out:
2670         btrfs_end_transaction(trans, extent_root);
2671         return ret;
2672 }
2673
2674 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
2675                                         struct btrfs_path *path,
2676                                         struct btrfs_key *extent_key)
2677 {
2678         struct btrfs_key key;
2679         struct btrfs_key found_key;
2680         struct extent_buffer *leaf;
2681         u32 nritems;
2682         u32 item_size;
2683         int ret = 0;
2684
2685         if (extent_key->objectid == 0) {
2686                 ret = del_extent_zero(extent_root, path, extent_key);
2687                 goto out;
2688         }
2689         key.objectid = extent_key->objectid;
2690         key.type = BTRFS_EXTENT_REF_KEY;
2691         key.offset = 0;
2692
2693         while(1) {
2694                 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2695
2696                 if (ret < 0)
2697                         goto out;
2698
2699                 ret = 0;
2700                 leaf = path->nodes[0];
2701                 nritems = btrfs_header_nritems(leaf);
2702                 if (path->slots[0] == nritems) {
2703                         ret = btrfs_next_leaf(extent_root, path);
2704                         if (ret > 0) {
2705                                 ret = 0;
2706                                 goto out;
2707                         }
2708                         if (ret < 0)
2709                                 goto out;
2710                         leaf = path->nodes[0];
2711                 }
2712
2713                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2714                 if (found_key.objectid != extent_key->objectid) {
2715                         break;
2716                 }
2717
2718                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
2719                         break;
2720                 }
2721
2722                 key.offset = found_key.offset + 1;
2723                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2724
2725                 ret = relocate_one_reference(extent_root, path, extent_key);
2726                 if (ret)
2727                         goto out;
2728         }
2729         ret = 0;
2730 out:
2731         btrfs_release_path(extent_root, path);
2732         return ret;
2733 }
2734
2735 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
2736 {
2737         u64 num_devices;
2738         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
2739                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
2740
2741         num_devices = root->fs_info->fs_devices->num_devices;
2742         if (num_devices == 1) {
2743                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2744                 stripped = flags & ~stripped;
2745
2746                 /* turn raid0 into single device chunks */
2747                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
2748                         return stripped;
2749
2750                 /* turn mirroring into duplication */
2751                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2752                              BTRFS_BLOCK_GROUP_RAID10))
2753                         return stripped | BTRFS_BLOCK_GROUP_DUP;
2754                 return flags;
2755         } else {
2756                 /* they already had raid on here, just return */
2757                 if (flags & stripped)
2758                         return flags;
2759
2760                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2761                 stripped = flags & ~stripped;
2762
2763                 /* switch duplicated blocks with raid1 */
2764                 if (flags & BTRFS_BLOCK_GROUP_DUP)
2765                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
2766
2767                 /* turn single device chunks into raid0 */
2768                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
2769         }
2770         return flags;
2771 }
2772
2773 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
2774 {
2775         struct btrfs_trans_handle *trans;
2776         struct btrfs_root *tree_root = root->fs_info->tree_root;
2777         struct btrfs_path *path;
2778         u64 cur_byte;
2779         u64 total_found;
2780         u64 shrink_last_byte;
2781         u64 new_alloc_flags;
2782         struct btrfs_block_group_cache *shrink_block_group;
2783         struct btrfs_fs_info *info = root->fs_info;
2784         struct btrfs_key key;
2785         struct btrfs_key found_key;
2786         struct extent_buffer *leaf;
2787         u32 nritems;
2788         int ret;
2789         int progress;
2790
2791         shrink_block_group = btrfs_lookup_block_group(root->fs_info,
2792                                                       shrink_start);
2793         BUG_ON(!shrink_block_group);
2794
2795         shrink_last_byte = shrink_start + shrink_block_group->key.offset;
2796
2797         shrink_block_group->space_info->total_bytes -=
2798                 shrink_block_group->key.offset;
2799         path = btrfs_alloc_path();
2800         root = root->fs_info->extent_root;
2801         path->reada = 2;
2802
2803         printk("btrfs relocating block group %llu flags %llu\n",
2804                (unsigned long long)shrink_start,
2805                (unsigned long long)shrink_block_group->flags);
2806
2807 again:
2808         if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
2809                 u64 calc;
2810
2811                 trans = btrfs_start_transaction(root, 1);
2812                 new_alloc_flags = update_block_group_flags(root,
2813                                                    shrink_block_group->flags);
2814                 if (new_alloc_flags != shrink_block_group->flags) {
2815                         calc =
2816                              btrfs_block_group_used(&shrink_block_group->item);
2817                 } else {
2818                         calc = shrink_block_group->key.offset;
2819                 }
2820                 do_chunk_alloc(trans, root->fs_info->extent_root,
2821                                calc + 2 * 1024 * 1024, new_alloc_flags);
2822                 btrfs_end_transaction(trans, root);
2823         }
2824         shrink_block_group->ro = 1;
2825
2826         total_found = 0;
2827         progress = 0;
2828         key.objectid = shrink_start;
2829         key.offset = 0;
2830         key.type = 0;
2831         cur_byte = key.objectid;
2832
2833         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2834         if (ret < 0)
2835                 goto out;
2836
2837         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
2838         if (ret < 0)
2839                 goto out;
2840
2841         if (ret == 0) {
2842                 leaf = path->nodes[0];
2843                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2844                 if (found_key.objectid + found_key.offset > shrink_start &&
2845                     found_key.objectid < shrink_last_byte) {
2846                         cur_byte = found_key.objectid;
2847                         key.objectid = cur_byte;
2848                 }
2849         }
2850         btrfs_release_path(root, path);
2851
2852         while(1) {
2853                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2854                 if (ret < 0)
2855                         goto out;
2856
2857                 leaf = path->nodes[0];
2858                 nritems = btrfs_header_nritems(leaf);
2859 next:
2860                 if (path->slots[0] >= nritems) {
2861                         ret = btrfs_next_leaf(root, path);
2862                         if (ret < 0)
2863                                 goto out;
2864                         if (ret == 1) {
2865                                 ret = 0;
2866                                 break;
2867                         }
2868                         leaf = path->nodes[0];
2869                         nritems = btrfs_header_nritems(leaf);
2870                 }
2871
2872                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2873
2874                 if (found_key.objectid >= shrink_last_byte)
2875                         break;
2876
2877                 if (progress && need_resched()) {
2878                         memcpy(&key, &found_key, sizeof(key));
2879                         mutex_unlock(&root->fs_info->fs_mutex);
2880                         cond_resched();
2881                         mutex_lock(&root->fs_info->fs_mutex);
2882                         btrfs_release_path(root, path);
2883                         btrfs_search_slot(NULL, root, &key, path, 0, 0);
2884                         progress = 0;
2885                         goto next;
2886                 }
2887                 progress = 1;
2888
2889                 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
2890                     found_key.objectid + found_key.offset <= cur_byte) {
2891                         path->slots[0]++;
2892                         goto next;
2893                 }
2894
2895                 total_found++;
2896                 cur_byte = found_key.objectid + found_key.offset;
2897                 key.objectid = cur_byte;
2898                 btrfs_release_path(root, path);
2899                 ret = relocate_one_extent(root, path, &found_key);
2900         }
2901
2902         btrfs_release_path(root, path);
2903
2904         if (total_found > 0) {
2905                 printk("btrfs relocate found %llu last extent was %llu\n",
2906                        (unsigned long long)total_found,
2907                        (unsigned long long)found_key.objectid);
2908                 trans = btrfs_start_transaction(tree_root, 1);
2909                 btrfs_commit_transaction(trans, tree_root);
2910
2911                 mutex_unlock(&root->fs_info->fs_mutex);
2912                 btrfs_clean_old_snapshots(tree_root);
2913                 mutex_lock(&root->fs_info->fs_mutex);
2914
2915                 trans = btrfs_start_transaction(tree_root, 1);
2916                 btrfs_commit_transaction(trans, tree_root);
2917                 goto again;
2918         }
2919
2920         /*
2921          * we've freed all the extents, now remove the block
2922          * group item from the tree
2923          */
2924         trans = btrfs_start_transaction(root, 1);
2925         memcpy(&key, &shrink_block_group->key, sizeof(key));
2926
2927         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2928         if (ret > 0)
2929                 ret = -EIO;
2930         if (ret < 0)
2931                 goto out;
2932
2933         leaf = path->nodes[0];
2934         nritems = btrfs_header_nritems(leaf);
2935         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2936         kfree(shrink_block_group);
2937
2938         clear_extent_bits(&info->block_group_cache, found_key.objectid,
2939                           found_key.objectid + found_key.offset - 1,
2940                           (unsigned int)-1, GFP_NOFS);
2941
2942         btrfs_del_item(trans, root, path);
2943         clear_extent_dirty(&info->free_space_cache,
2944                            shrink_start, shrink_last_byte - 1,
2945                            GFP_NOFS);
2946         btrfs_commit_transaction(trans, root);
2947 out:
2948         btrfs_free_path(path);
2949         return ret;
2950 }
2951
2952 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
2953                            struct btrfs_key *key)
2954 {
2955         int ret;
2956         struct btrfs_key found_key;
2957         struct extent_buffer *leaf;
2958         int slot;
2959
2960         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2961         if (ret < 0)
2962                 return ret;
2963         while(1) {
2964                 slot = path->slots[0];
2965                 leaf = path->nodes[0];
2966                 if (slot >= btrfs_header_nritems(leaf)) {
2967                         ret = btrfs_next_leaf(root, path);
2968                         if (ret == 0)
2969                                 continue;
2970                         if (ret < 0)
2971                                 goto error;
2972                         break;
2973                 }
2974                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2975
2976                 if (found_key.objectid >= key->objectid &&
2977                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
2978                         return 0;
2979                 path->slots[0]++;
2980         }
2981         ret = -ENOENT;
2982 error:
2983         return ret;
2984 }
2985
2986 int btrfs_read_block_groups(struct btrfs_root *root)
2987 {
2988         struct btrfs_path *path;
2989         int ret;
2990         int bit;
2991         struct btrfs_block_group_cache *cache;
2992         struct btrfs_fs_info *info = root->fs_info;
2993         struct btrfs_space_info *space_info;
2994         struct extent_io_tree *block_group_cache;
2995         struct btrfs_key key;
2996         struct btrfs_key found_key;
2997         struct extent_buffer *leaf;
2998
2999         block_group_cache = &info->block_group_cache;
3000         root = info->extent_root;
3001         key.objectid = 0;
3002         key.offset = 0;
3003         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3004         path = btrfs_alloc_path();
3005         if (!path)
3006                 return -ENOMEM;
3007
3008         while(1) {
3009                 ret = find_first_block_group(root, path, &key);
3010                 if (ret > 0) {
3011                         ret = 0;
3012                         goto error;
3013                 }
3014                 if (ret != 0)
3015                         goto error;
3016
3017                 leaf = path->nodes[0];
3018                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3019                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
3020                 if (!cache) {
3021                         ret = -ENOMEM;
3022                         break;
3023                 }
3024
3025                 read_extent_buffer(leaf, &cache->item,
3026                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
3027                                    sizeof(cache->item));
3028                 memcpy(&cache->key, &found_key, sizeof(found_key));
3029
3030                 key.objectid = found_key.objectid + found_key.offset;
3031                 btrfs_release_path(root, path);
3032                 cache->flags = btrfs_block_group_flags(&cache->item);
3033                 bit = 0;
3034                 if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3035                         bit = BLOCK_GROUP_DATA;
3036                 } else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3037                         bit = BLOCK_GROUP_SYSTEM;
3038                 } else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
3039                         bit = BLOCK_GROUP_METADATA;
3040                 }
3041                 set_avail_alloc_bits(info, cache->flags);
3042
3043                 ret = update_space_info(info, cache->flags, found_key.offset,
3044                                         btrfs_block_group_used(&cache->item),
3045                                         &space_info);
3046                 BUG_ON(ret);
3047                 cache->space_info = space_info;
3048
3049                 /* use EXTENT_LOCKED to prevent merging */
3050                 set_extent_bits(block_group_cache, found_key.objectid,
3051                                 found_key.objectid + found_key.offset - 1,
3052                                 bit | EXTENT_LOCKED, GFP_NOFS);
3053                 set_state_private(block_group_cache, found_key.objectid,
3054                                   (unsigned long)cache);
3055
3056                 if (key.objectid >=
3057                     btrfs_super_total_bytes(&info->super_copy))
3058                         break;
3059         }
3060         ret = 0;
3061 error:
3062         btrfs_free_path(path);
3063         return ret;
3064 }
3065
3066 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3067                            struct btrfs_root *root, u64 bytes_used,
3068                            u64 type, u64 chunk_objectid, u64 chunk_offset,
3069                            u64 size)
3070 {
3071         int ret;
3072         int bit = 0;
3073         struct btrfs_root *extent_root;
3074         struct btrfs_block_group_cache *cache;
3075         struct extent_io_tree *block_group_cache;
3076
3077         extent_root = root->fs_info->extent_root;
3078         block_group_cache = &root->fs_info->block_group_cache;
3079
3080         cache = kzalloc(sizeof(*cache), GFP_NOFS);
3081         BUG_ON(!cache);
3082         cache->key.objectid = chunk_offset;
3083         cache->key.offset = size;
3084
3085         btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3086         memset(&cache->item, 0, sizeof(cache->item));
3087         btrfs_set_block_group_used(&cache->item, bytes_used);
3088         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
3089         cache->flags = type;
3090         btrfs_set_block_group_flags(&cache->item, type);
3091
3092         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
3093                                 &cache->space_info);
3094         BUG_ON(ret);
3095
3096         bit = block_group_state_bits(type);
3097         set_extent_bits(block_group_cache, chunk_offset,
3098                         chunk_offset + size - 1,
3099                         bit | EXTENT_LOCKED, GFP_NOFS);
3100
3101         set_state_private(block_group_cache, chunk_offset,
3102                           (unsigned long)cache);
3103         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
3104                                 sizeof(cache->item));
3105         BUG_ON(ret);
3106
3107         finish_current_insert(trans, extent_root);
3108         ret = del_pending_extents(trans, extent_root);
3109         BUG_ON(ret);
3110         set_avail_alloc_bits(extent_root->fs_info, type);
3111         return 0;
3112 }