Btrfs: use a cached state for extent state operations during delalloc
fs/btrfs/extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/writeback.h>
13 #include <linux/pagevec.h>
14 #include "extent_io.h"
15 #include "extent_map.h"
16 #include "compat.h"
17 #include "ctree.h"
18 #include "btrfs_inode.h"
19
20 static struct kmem_cache *extent_state_cache;
21 static struct kmem_cache *extent_buffer_cache;
22
23 static LIST_HEAD(buffers);
24 static LIST_HEAD(states);
25
26 #define LEAK_DEBUG 0
27 #if LEAK_DEBUG
28 static DEFINE_SPINLOCK(leak_lock);
29 #endif
30
31 #define BUFFER_LRU_MAX 64
32
33 struct tree_entry {
34         u64 start;
35         u64 end;
36         struct rb_node rb_node;
37 };
38
39 struct extent_page_data {
40         struct bio *bio;
41         struct extent_io_tree *tree;
42         get_extent_t *get_extent;
43
44         /* tells writepage not to lock the state bits for this range
45          * it still does the unlocking
46          */
47         unsigned int extent_locked:1;
48
49         /* tells the submit_bio code to use a WRITE_SYNC */
50         unsigned int sync_io:1;
51 };
52
53 int __init extent_io_init(void)
54 {
55         extent_state_cache = kmem_cache_create("extent_state",
56                         sizeof(struct extent_state), 0,
57                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
58         if (!extent_state_cache)
59                 return -ENOMEM;
60
61         extent_buffer_cache = kmem_cache_create("extent_buffers",
62                         sizeof(struct extent_buffer), 0,
63                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
64         if (!extent_buffer_cache)
65                 goto free_state_cache;
66         return 0;
67
68 free_state_cache:
69         kmem_cache_destroy(extent_state_cache);
70         return -ENOMEM;
71 }
72
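/*
 * Module teardown: report any extent_state / extent_buffer objects still on
 * the leak lists (only populated when LEAK_DEBUG is enabled), free them, and
 * destroy the two slab caches.
 */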
73 void extent_io_exit(void)
74 {
75         struct extent_state *state;
76         struct extent_buffer *eb;
77
78         while (!list_empty(&states)) {
79                 state = list_entry(states.next, struct extent_state, leak_list);
80                 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
81                        "state %lu in tree %p refs %d\n",
82                        (unsigned long long)state->start,
83                        (unsigned long long)state->end,
84                        state->state, state->tree, atomic_read(&state->refs));
85                 list_del(&state->leak_list);
86                 kmem_cache_free(extent_state_cache, state);
87
88         }
89
90         while (!list_empty(&buffers)) {
91                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
92                 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
93                        "refs %d\n", (unsigned long long)eb->start,
94                        eb->len, atomic_read(&eb->refs));
95                 list_del(&eb->leak_list);
96                 kmem_cache_free(extent_buffer_cache, eb);
97         }
98         if (extent_state_cache)
99                 kmem_cache_destroy(extent_state_cache);
100         if (extent_buffer_cache)
101                 kmem_cache_destroy(extent_buffer_cache);
102 }
103
104 void extent_io_tree_init(struct extent_io_tree *tree,
105                           struct address_space *mapping, gfp_t mask)
106 {
107         tree->state.rb_node = NULL;
108         tree->buffer.rb_node = NULL;
109         tree->ops = NULL;
110         tree->dirty_bytes = 0;
111         spin_lock_init(&tree->lock);
112         spin_lock_init(&tree->buffer_lock);
113         tree->mapping = mapping;
114 }
115
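/*
 * Allocate a new extent_state.  The returned state has no bits set, is not
 * linked into any tree, and carries a single reference for the caller.
 */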
116 static struct extent_state *alloc_extent_state(gfp_t mask)
117 {
118         struct extent_state *state;
119 #if LEAK_DEBUG
120         unsigned long flags;
121 #endif
122
123         state = kmem_cache_alloc(extent_state_cache, mask);
124         if (!state)
125                 return state;
126         state->state = 0;
127         state->private = 0;
128         state->tree = NULL;
129 #if LEAK_DEBUG
130         spin_lock_irqsave(&leak_lock, flags);
131         list_add(&state->leak_list, &states);
132         spin_unlock_irqrestore(&leak_lock, flags);
133 #endif
134         atomic_set(&state->refs, 1);
135         init_waitqueue_head(&state->wq);
136         return state;
137 }
138
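/*
 * Drop one reference on 'state'.  The struct is freed once the last
 * reference is gone; it must already be unlinked from its tree by then.
 */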
139 static void free_extent_state(struct extent_state *state)
140 {
141         if (!state)
142                 return;
143         if (atomic_dec_and_test(&state->refs)) {
144 #if LEAK_DEBUG
145                 unsigned long flags;
146 #endif
147                 WARN_ON(state->tree);
148 #if LEAK_DEBUG
149                 spin_lock_irqsave(&leak_lock, flags);
150                 list_del(&state->leak_list);
151                 spin_unlock_irqrestore(&leak_lock, flags);
152 #endif
153                 kmem_cache_free(extent_state_cache, state);
154         }
155 }
156
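/*
 * Link 'node' into the range rbtree keyed by 'offset'.  If an existing node
 * already covers 'offset', nothing is inserted and that node is returned;
 * otherwise NULL is returned after the insert.
 */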
157 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
158                                    struct rb_node *node)
159 {
160         struct rb_node **p = &root->rb_node;
161         struct rb_node *parent = NULL;
162         struct tree_entry *entry;
163
164         while (*p) {
165                 parent = *p;
166                 entry = rb_entry(parent, struct tree_entry, rb_node);
167
168                 if (offset < entry->start)
169                         p = &(*p)->rb_left;
170                 else if (offset > entry->end)
171                         p = &(*p)->rb_right;
172                 else
173                         return parent;
174         }
175
176         entry = rb_entry(node, struct tree_entry, rb_node);
177         rb_link_node(node, parent, p);
178         rb_insert_color(node, root);
179         return NULL;
180 }
181
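/*
 * Search the state tree for a node covering 'offset'.  If none exists,
 * NULL is returned and, when requested, *prev_ret is set to the first node
 * that ends at or after 'offset' and *next_ret to the last node that starts
 * at or before it (either may be NULL at the edges of the tree).
 */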
182 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
183                                      struct rb_node **prev_ret,
184                                      struct rb_node **next_ret)
185 {
186         struct rb_root *root = &tree->state;
187         struct rb_node *n = root->rb_node;
188         struct rb_node *prev = NULL;
189         struct rb_node *orig_prev = NULL;
190         struct tree_entry *entry;
191         struct tree_entry *prev_entry = NULL;
192
193         while (n) {
194                 entry = rb_entry(n, struct tree_entry, rb_node);
195                 prev = n;
196                 prev_entry = entry;
197
198                 if (offset < entry->start)
199                         n = n->rb_left;
200                 else if (offset > entry->end)
201                         n = n->rb_right;
202                 else
203                         return n;
204         }
205
206         if (prev_ret) {
207                 orig_prev = prev;
208                 while (prev && offset > prev_entry->end) {
209                         prev = rb_next(prev);
210                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
211                 }
212                 *prev_ret = prev;
213                 prev = orig_prev;
214         }
215
216         if (next_ret) {
217                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
218                 while (prev && offset < prev_entry->start) {
219                         prev = rb_prev(prev);
220                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
221                 }
222                 *next_ret = prev;
223         }
224         return NULL;
225 }
226
227 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
228                                           u64 offset)
229 {
230         struct rb_node *prev = NULL;
231         struct rb_node *ret;
232
233         ret = __etree_search(tree, offset, &prev, NULL);
234         if (!ret)
235                 return prev;
236         return ret;
237 }
238
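/*
 * Insert an extent_buffer node into the per-tree buffer rbtree, keyed by the
 * buffer's start offset.  Returns an already-present buffer at that offset
 * instead of inserting, or NULL on success.
 */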
239 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
240                                           u64 offset, struct rb_node *node)
241 {
242         struct rb_root *root = &tree->buffer;
243         struct rb_node **p = &root->rb_node;
244         struct rb_node *parent = NULL;
245         struct extent_buffer *eb;
246
247         while (*p) {
248                 parent = *p;
249                 eb = rb_entry(parent, struct extent_buffer, rb_node);
250
251                 if (offset < eb->start)
252                         p = &(*p)->rb_left;
253                 else if (offset > eb->start)
254                         p = &(*p)->rb_right;
255                 else
256                         return eb;
257         }
258
259         rb_link_node(node, parent, p);
260         rb_insert_color(node, root);
261         return NULL;
262 }
263
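/* find the extent_buffer that starts at 'offset', or NULL if there is none */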
264 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
265                                            u64 offset)
266 {
267         struct rb_root *root = &tree->buffer;
268         struct rb_node *n = root->rb_node;
269         struct extent_buffer *eb;
270
271         while (n) {
272                 eb = rb_entry(n, struct extent_buffer, rb_node);
273                 if (offset < eb->start)
274                         n = n->rb_left;
275                 else if (offset > eb->start)
276                         n = n->rb_right;
277                 else
278                         return eb;
279         }
280         return NULL;
281 }
282
283 /*
284  * utility function to look for merge candidates inside a given range.
285  * Any extents with matching state are merged together into a single
286  * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set
287  * are not merged because the end_io handlers need to be able to do
288  * operations on them without sleeping (or doing allocations/splits).
289  *
290  * This should be called with the tree lock held.
291  */
292 static int merge_state(struct extent_io_tree *tree,
293                        struct extent_state *state)
294 {
295         struct extent_state *other;
296         struct rb_node *other_node;
297
298         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
299                 return 0;
300
301         other_node = rb_prev(&state->rb_node);
302         if (other_node) {
303                 other = rb_entry(other_node, struct extent_state, rb_node);
304                 if (other->end == state->start - 1 &&
305                     other->state == state->state) {
306                         state->start = other->start;
307                         other->tree = NULL;
308                         rb_erase(&other->rb_node, &tree->state);
309                         free_extent_state(other);
310                 }
311         }
312         other_node = rb_next(&state->rb_node);
313         if (other_node) {
314                 other = rb_entry(other_node, struct extent_state, rb_node);
315                 if (other->start == state->end + 1 &&
316                     other->state == state->state) {
317                         other->start = state->start;
318                         state->tree = NULL;
319                         rb_erase(&state->rb_node, &tree->state);
320                         free_extent_state(state);
321                 }
322         }
323         return 0;
324 }
325
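/*
 * set_state_cb/clear_state_cb give the owner of the tree (via tree->ops) a
 * chance to account for bits being set or cleared on a state, e.g. so the
 * inode can track outstanding delalloc bytes.
 */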
326 static void set_state_cb(struct extent_io_tree *tree,
327                          struct extent_state *state,
328                          unsigned long bits)
329 {
330         if (tree->ops && tree->ops->set_bit_hook) {
331                 tree->ops->set_bit_hook(tree->mapping->host, state->start,
332                                         state->end, state->state, bits);
333         }
334 }
335
336 static void clear_state_cb(struct extent_io_tree *tree,
337                            struct extent_state *state,
338                            unsigned long bits)
339 {
340         if (tree->ops && tree->ops->clear_bit_hook) {
341                 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
342                                           state->end, state->state, bits);
343         }
344 }
345
346 /*
347  * insert an extent_state struct into the tree.  'bits' are set on the
348  * struct before it is inserted.
349  *
350  * This may return -EEXIST if the extent is already there, in which case the
351  * state struct is freed.
352  *
353  * The tree lock is not taken internally.  This is a utility function and
354  * probably isn't what you want to call (see set/clear_extent_bit).
355  */
356 static int insert_state(struct extent_io_tree *tree,
357                         struct extent_state *state, u64 start, u64 end,
358                         int bits)
359 {
360         struct rb_node *node;
361
362         if (end < start) {
363                 printk(KERN_ERR "btrfs end < start %llu %llu\n",
364                        (unsigned long long)end,
365                        (unsigned long long)start);
366                 WARN_ON(1);
367         }
368         if (bits & EXTENT_DIRTY)
369                 tree->dirty_bytes += end - start + 1;
370         state->start = start;
371         state->end = end;
372         set_state_cb(tree, state, bits);
373         state->state |= bits;
374         node = tree_insert(&tree->state, end, &state->rb_node);
375         if (node) {
376                 struct extent_state *found;
377                 found = rb_entry(node, struct extent_state, rb_node);
378                 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
379                        "%llu %llu\n", (unsigned long long)found->start,
380                        (unsigned long long)found->end,
381                        (unsigned long long)start, (unsigned long long)end);
382                 free_extent_state(state);
383                 return -EEXIST;
384         }
385         state->tree = tree;
386         merge_state(tree, state);
387         return 0;
388 }
389
390 /*
391  * split a given extent state struct in two, inserting the preallocated
392  * struct 'prealloc' as the newly created second half.  'split' indicates an
393  * offset inside 'orig' where it should be split.
394  *
395  * Before calling,
396  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
397  * are two extent state structs in the tree:
398  * prealloc: [orig->start, split - 1]
399  * orig: [ split, orig->end ]
400  *
401  * The tree locks are not taken by this function. They need to be held
402  * by the caller.
403  */
404 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405                        struct extent_state *prealloc, u64 split)
406 {
407         struct rb_node *node;
408         prealloc->start = orig->start;
409         prealloc->end = split - 1;
410         prealloc->state = orig->state;
411         orig->start = split;
412
413         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
414         if (node) {
415                 free_extent_state(prealloc);
416                 return -EEXIST;
417         }
418         prealloc->tree = tree;
419         return 0;
420 }
421
422 /*
423  * utility function to clear some bits in an extent state struct.
424  * it will optionally wake up any one waiting on this state (wake == 1), or
425  * forcibly remove the state from the tree (delete == 1).
426  *
427  * If no bits are set on the state struct after clearing things, the
428  * struct is freed and removed from the tree
429  */
430 static int clear_state_bit(struct extent_io_tree *tree,
431                             struct extent_state *state, int bits, int wake,
432                             int delete)
433 {
434         int ret = state->state & bits;
435
436         if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
437                 u64 range = state->end - state->start + 1;
438                 WARN_ON(range > tree->dirty_bytes);
439                 tree->dirty_bytes -= range;
440         }
441         clear_state_cb(tree, state, bits);
442         state->state &= ~bits;
443         if (wake)
444                 wake_up(&state->wq);
445         if (delete || state->state == 0) {
446                 if (state->tree) {
447                         clear_state_cb(tree, state, state->state);
448                         rb_erase(&state->rb_node, &tree->state);
449                         state->tree = NULL;
450                         free_extent_state(state);
451                 } else {
452                         WARN_ON(1);
453                 }
454         } else {
455                 merge_state(tree, state);
456         }
457         return ret;
458 }
459
460 /*
461  * clear some bits on a range in the tree.  This may require splitting
462  * or inserting elements in the tree, so the gfp mask is used to
463  * indicate which allocations or sleeping are allowed.
464  *
465  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
466  * the given range from the tree regardless of state (ie for truncate).
467  *
468  * the range [start, end] is inclusive.
469  *
470  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
471  * bits were already set, or zero if none of the bits were already set.
472  */
473 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
474                      int bits, int wake, int delete,
475                      struct extent_state **cached_state,
476                      gfp_t mask)
477 {
478         struct extent_state *state;
479         struct extent_state *cached;
480         struct extent_state *prealloc = NULL;
481         struct rb_node *next_node;
482         struct rb_node *node;
483         u64 last_end;
484         int err;
485         int set = 0;
486
487 again:
488         if (!prealloc && (mask & __GFP_WAIT)) {
489                 prealloc = alloc_extent_state(mask);
490                 if (!prealloc)
491                         return -ENOMEM;
492         }
493
494         spin_lock(&tree->lock);
495         if (cached_state) {
496                 cached = *cached_state;
497                 *cached_state = NULL;
498                 if (cached && cached->tree && cached->start == start) {
499                         atomic_dec(&cached->refs);
500                         state = cached;
501                         last_end = state->end;
502                         goto found;
503                 }
504                 free_extent_state(cached);
505         }
506         /*
507          * this search will find the extents that end after
508          * our range starts
509          */
510         node = tree_search(tree, start);
511         if (!node)
512                 goto out;
513         state = rb_entry(node, struct extent_state, rb_node);
514 hit_next:
515         if (state->start > end)
516                 goto out;
517         WARN_ON(state->end < start);
518         last_end = state->end;
519
520         /*
521          *     | ---- desired range ---- |
522          *  | state | or
523          *  | ------------- state -------------- |
524          *
525          * We need to split the extent we found, and may flip
526          * bits on second half.
527          *
528          * If the extent we found extends past our range, we
529          * just split and search again.  It'll get split again
530          * the next time though.
531          *
532          * If the extent we found is inside our range, we clear
533          * the desired bit on it.
534          */
535
536         if (state->start < start) {
537                 if (!prealloc)
538                         prealloc = alloc_extent_state(GFP_ATOMIC);
                BUG_ON(!prealloc);
539                 err = split_state(tree, state, prealloc, start);
540                 BUG_ON(err == -EEXIST);
541                 prealloc = NULL;
542                 if (err)
543                         goto out;
544                 if (state->end <= end) {
545                         set |= clear_state_bit(tree, state, bits,
546                                         wake, delete);
547                         if (last_end == (u64)-1)
548                                 goto out;
549                         start = last_end + 1;
550                 } else {
551                         start = state->start;
552                 }
553                 goto search_again;
554         }
555         /*
556          * | ---- desired range ---- |
557          *                        | state |
558          * We need to split the extent, and clear the bit
559          * on the first half
560          */
561         if (state->start <= end && state->end > end) {
562                 if (!prealloc)
563                         prealloc = alloc_extent_state(GFP_ATOMIC);
                BUG_ON(!prealloc);
564                 err = split_state(tree, state, prealloc, end + 1);
565                 BUG_ON(err == -EEXIST);
566
567                 if (wake)
568                         wake_up(&state->wq);
569                 set |= clear_state_bit(tree, prealloc, bits,
570                                        wake, delete);
571                 prealloc = NULL;
572                 goto out;
573         }
574 found:
575         if (state->end < end && prealloc && !need_resched())
576                 next_node = rb_next(&state->rb_node);
577         else
578                 next_node = NULL;
579         set |= clear_state_bit(tree, state, bits, wake, delete);
580         if (last_end == (u64)-1)
581                 goto out;
582         start = last_end + 1;
583         if (start <= end && next_node) {
584                 state = rb_entry(next_node, struct extent_state,
585                                  rb_node);
586                 if (state->start == start)
587                         goto hit_next;
588         }
589         goto search_again;
590
591 out:
592         spin_unlock(&tree->lock);
593         if (prealloc)
594                 free_extent_state(prealloc);
595
596         return set;
597
598 search_again:
599         if (start > end)
600                 goto out;
601         spin_unlock(&tree->lock);
602         if (mask & __GFP_WAIT)
603                 cond_resched();
604         goto again;
605 }
606
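/*
 * Sleep until somebody wakes up 'state' (i.e. clears bits on it).  The tree
 * lock is dropped across the schedule() and retaken before returning, as the
 * sparse annotations below note.
 */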
607 static int wait_on_state(struct extent_io_tree *tree,
608                          struct extent_state *state)
609                 __releases(tree->lock)
610                 __acquires(tree->lock)
611 {
612         DEFINE_WAIT(wait);
613         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
614         spin_unlock(&tree->lock);
615         schedule();
616         spin_lock(&tree->lock);
617         finish_wait(&state->wq, &wait);
618         return 0;
619 }
620
621 /*
622  * waits for one or more bits to clear on a range in the state tree.
623  * The range [start, end] is inclusive.
624  * The tree lock is taken by this function
625  */
626 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
627 {
628         struct extent_state *state;
629         struct rb_node *node;
630
631         spin_lock(&tree->lock);
632 again:
633         while (1) {
634                 /*
635                  * this search will find all the extents that end after
636                  * our range starts
637                  */
638                 node = tree_search(tree, start);
639                 if (!node)
640                         break;
641
642                 state = rb_entry(node, struct extent_state, rb_node);
643
644                 if (state->start > end)
645                         goto out;
646
647                 if (state->state & bits) {
648                         start = state->start;
649                         atomic_inc(&state->refs);
650                         wait_on_state(tree, state);
651                         free_extent_state(state);
652                         goto again;
653                 }
654                 start = state->end + 1;
655
656                 if (start > end)
657                         break;
658
659                 if (need_resched()) {
660                         spin_unlock(&tree->lock);
661                         cond_resched();
662                         spin_lock(&tree->lock);
663                 }
664         }
665 out:
666         spin_unlock(&tree->lock);
667         return 0;
668 }
669
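/*
 * Set 'bits' on a single state, keeping tree->dirty_bytes up to date when
 * EXTENT_DIRTY is newly added.  The caller must hold the tree lock.
 */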
670 static void set_state_bits(struct extent_io_tree *tree,
671                            struct extent_state *state,
672                            int bits)
673 {
674         if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
675                 u64 range = state->end - state->start + 1;
676                 tree->dirty_bytes += range;
677         }
678         set_state_cb(tree, state, bits);
679         state->state |= bits;
680 }
681
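/*
 * Remember 'state' in *cached_ptr (taking an extra reference) so that a later
 * set/clear on the same range can skip the rbtree search.  Only states that
 * carry EXTENT_IOBITS or EXTENT_BOUNDARY are worth caching, since anything
 * else may get merged away underneath us.
 */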
682 static void cache_state(struct extent_state *state,
683                         struct extent_state **cached_ptr)
684 {
685         if (cached_ptr && !(*cached_ptr)) {
686                 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
687                         *cached_ptr = state;
688                         atomic_inc(&state->refs);
689                 }
690         }
691 }
692
693 /*
694  * set some bits on a range in the tree.  This may require allocations or
695  * sleeping, so the gfp mask is used to indicate what is allowed.
696  *
697  * If any of the exclusive bits are set, this will fail with -EEXIST if some
698  * part of the range already has the desired bits set.  The start of the
699  * existing range is returned in failed_start in this case.
700  *
701  * [start, end] is inclusive.  This takes the tree lock.
702  */
703
704 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
705                           int bits, int exclusive_bits, u64 *failed_start,
706                           struct extent_state **cached_state,
707                           gfp_t mask)
708 {
709         struct extent_state *state;
710         struct extent_state *prealloc = NULL;
711         struct rb_node *node;
712         int err = 0;
713         u64 last_start;
714         u64 last_end;
715 again:
716         if (!prealloc && (mask & __GFP_WAIT)) {
717                 prealloc = alloc_extent_state(mask);
718                 if (!prealloc)
719                         return -ENOMEM;
720         }
721
722         spin_lock(&tree->lock);
723         if (cached_state && *cached_state) {
724                 state = *cached_state;
725                 if (state->start == start && state->tree) {
726                         node = &state->rb_node;
727                         goto hit_next;
728                 }
729         }
730         /*
731          * this search will find all the extents that end after
732          * our range starts.
733          */
734         node = tree_search(tree, start);
735         if (!node) {
736                 err = insert_state(tree, prealloc, start, end, bits);
737                 prealloc = NULL;
738                 BUG_ON(err == -EEXIST);
739                 goto out;
740         }
741         state = rb_entry(node, struct extent_state, rb_node);
742 hit_next:
743         last_start = state->start;
744         last_end = state->end;
745
746         /*
747          * | ---- desired range ---- |
748          * | state |
749          *
750          * Just lock what we found and keep going
751          */
752         if (state->start == start && state->end <= end) {
753                 struct rb_node *next_node;
754                 if (state->state & exclusive_bits) {
755                         *failed_start = state->start;
756                         err = -EEXIST;
757                         goto out;
758                 }
759                 set_state_bits(tree, state, bits);
760                 cache_state(state, cached_state);
761                 merge_state(tree, state);
762                 if (last_end == (u64)-1)
763                         goto out;
764
765                 start = last_end + 1;
766                 if (start < end && prealloc && !need_resched()) {
767                         next_node = rb_next(node);
768                         if (next_node) {
769                                 state = rb_entry(next_node, struct extent_state,
770                                                  rb_node);
771                                 if (state->start == start)
772                                         goto hit_next;
773                         }
774                 }
775                 goto search_again;
776         }
777
778         /*
779          *     | ---- desired range ---- |
780          * | state |
781          *   or
782          * | ------------- state -------------- |
783          *
784          * We need to split the extent we found, and may flip bits on
785          * second half.
786          *
787          * If the extent we found extends past our
788          * range, we just split and search again.  It'll get split
789          * again the next time though.
790          *
791          * If the extent we found is inside our range, we set the
792          * desired bit on it.
793          */
794         if (state->start < start) {
795                 if (state->state & exclusive_bits) {
796                         *failed_start = start;
797                         err = -EEXIST;
798                         goto out;
799                 }
800                 err = split_state(tree, state, prealloc, start);
801                 BUG_ON(err == -EEXIST);
802                 prealloc = NULL;
803                 if (err)
804                         goto out;
805                 if (state->end <= end) {
806                         set_state_bits(tree, state, bits);
807                         cache_state(state, cached_state);
808                         merge_state(tree, state);
809                         if (last_end == (u64)-1)
810                                 goto out;
811                         start = last_end + 1;
812                 } else {
813                         start = state->start;
814                 }
815                 goto search_again;
816         }
817         /*
818          * | ---- desired range ---- |
819          *     | state | or               | state |
820          *
821          * There's a hole, we need to insert something in it and
822          * ignore the extent we found.
823          */
824         if (state->start > start) {
825                 u64 this_end;
826                 if (end < last_start)
827                         this_end = end;
828                 else
829                         this_end = last_start - 1;
830                 err = insert_state(tree, prealloc, start, this_end,
831                                    bits);
832                 cache_state(prealloc, cached_state);
833                 prealloc = NULL;
834                 BUG_ON(err == -EEXIST);
835                 if (err)
836                         goto out;
837                 start = this_end + 1;
838                 goto search_again;
839         }
840         /*
841          * | ---- desired range ---- |
842          *                        | state |
843          * We need to split the extent, and set the bit
844          * on the first half
845          */
846         if (state->start <= end && state->end > end) {
847                 if (state->state & exclusive_bits) {
848                         *failed_start = start;
849                         err = -EEXIST;
850                         goto out;
851                 }
852                 err = split_state(tree, state, prealloc, end + 1);
853                 BUG_ON(err == -EEXIST);
854
855                 set_state_bits(tree, prealloc, bits);
856                 cache_state(prealloc, cached_state);
857                 merge_state(tree, prealloc);
858                 prealloc = NULL;
859                 goto out;
860         }
861
862         goto search_again;
863
864 out:
865         spin_unlock(&tree->lock);
866         if (prealloc)
867                 free_extent_state(prealloc);
868
869         return err;
870
871 search_again:
872         if (start > end)
873                 goto out;
874         spin_unlock(&tree->lock);
875         if (mask & __GFP_WAIT)
876                 cond_resched();
877         goto again;
878 }
879
880 /* wrappers around set/clear extent bit */
881 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
882                      gfp_t mask)
883 {
884         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
885                               NULL, mask);
886 }
887
888 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
889                        gfp_t mask)
890 {
891         return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, NULL,
892                               mask);
893 }
894
895 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
896                     int bits, gfp_t mask)
897 {
898         return set_extent_bit(tree, start, end, bits, 0, NULL,
899                               NULL, mask);
900 }
901
902 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
903                       int bits, gfp_t mask)
904 {
905         return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
906 }
907
908 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
909                      gfp_t mask)
910 {
911         return set_extent_bit(tree, start, end,
912                               EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
913                               0, NULL, NULL, mask);
914 }
915
916 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
917                        gfp_t mask)
918 {
919         return clear_extent_bit(tree, start, end,
920                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
921                                 NULL, mask);
922 }
923
924 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
925                          gfp_t mask)
926 {
927         return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0,
928                                 NULL, mask);
929 }
930
931 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
932                      gfp_t mask)
933 {
934         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
935                               NULL, mask);
936 }
937
938 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
939                        gfp_t mask)
940 {
941         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
942                                 NULL, mask);
943 }
944
945 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
946                         gfp_t mask)
947 {
948         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
949                               NULL, mask);
950 }
951
952 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
953                                  u64 end, gfp_t mask)
954 {
955         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
956                                 NULL, mask);
957 }
958
959 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
960 {
961         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
962 }
963
964 /*
965  * either insert or lock the state struct between start and end; use mask to
966  * tell us if waiting is desired.
967  */
968 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
969                      int bits, struct extent_state **cached_state, gfp_t mask)
970 {
971         int err;
972         u64 failed_start;
973         while (1) {
974                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
975                                      EXTENT_LOCKED, &failed_start,
976                                      cached_state, mask);
977                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
978                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
979                         start = failed_start;
980                 } else {
981                         break;
982                 }
983                 WARN_ON(start > end);
984         }
985         return err;
986 }
987
988 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
989 {
990         return lock_extent_bits(tree, start, end, 0, NULL, mask);
991 }
992
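/*
 * Non-blocking version of lock_extent: returns 1 if the whole range was
 * locked, or 0 (after undoing any partial lock) if part of it was already
 * locked by someone else.
 */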
993 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
994                     gfp_t mask)
995 {
996         int err;
997         u64 failed_start;
998
999         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1000                              &failed_start, NULL, mask);
1001         if (err == -EEXIST) {
1002                 if (failed_start > start)
1003                         clear_extent_bit(tree, start, failed_start - 1,
1004                                          EXTENT_LOCKED, 1, 0, NULL, mask);
1005                 return 0;
1006         }
1007         return 1;
1008 }
1009
1010 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1011                          struct extent_state **cached, gfp_t mask)
1012 {
1013         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1014                                 mask);
1015 }
1016
1017 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1018                   gfp_t mask)
1019 {
1020         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1021                                 mask);
1022 }
1023
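/*
 * Typical flow for the cached extent_state (a sketch; find_lock_delalloc_range
 * below does exactly this):
 *
 *	struct extent_state *cached_state = NULL;
 *
 *	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 *	... test_range_bit(tree, start, end, bits, 1, cached_state) ...
 *	unlock_extent_cached(tree, start, end, &cached_state, GFP_NOFS);
 *
 * The cached pointer lets clear_extent_bit skip the rbtree walk when the
 * state being unlocked is the same one that was locked.
 */
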
1024 /*
1025  * helper function to set pages and extents in the tree dirty
1026  */
1027 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
1028 {
1029         unsigned long index = start >> PAGE_CACHE_SHIFT;
1030         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1031         struct page *page;
1032
1033         while (index <= end_index) {
1034                 page = find_get_page(tree->mapping, index);
1035                 BUG_ON(!page);
1036                 __set_page_dirty_nobuffers(page);
1037                 page_cache_release(page);
1038                 index++;
1039         }
1040         return 0;
1041 }
1042
1043 /*
1044  * helper function to set both pages and extents in the tree writeback
1045  */
1046 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1047 {
1048         unsigned long index = start >> PAGE_CACHE_SHIFT;
1049         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1050         struct page *page;
1051
1052         while (index <= end_index) {
1053                 page = find_get_page(tree->mapping, index);
1054                 BUG_ON(!page);
1055                 set_page_writeback(page);
1056                 page_cache_release(page);
1057                 index++;
1058         }
1059         return 0;
1060 }
1061
1062 /*
1063  * find the first offset in the io tree with 'bits' set. zero is
1064  * returned if we find something, and *start_ret and *end_ret are
1065  * set to reflect the state struct that was found.
1066  *
1067  * If nothing was found, 1 is returned, < 0 on error
1068  */
1069 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1070                           u64 *start_ret, u64 *end_ret, int bits)
1071 {
1072         struct rb_node *node;
1073         struct extent_state *state;
1074         int ret = 1;
1075
1076         spin_lock(&tree->lock);
1077         /*
1078          * this search will find all the extents that end after
1079          * our range starts.
1080          */
1081         node = tree_search(tree, start);
1082         if (!node)
1083                 goto out;
1084
1085         while (1) {
1086                 state = rb_entry(node, struct extent_state, rb_node);
1087                 if (state->end >= start && (state->state & bits)) {
1088                         *start_ret = state->start;
1089                         *end_ret = state->end;
1090                         ret = 0;
1091                         break;
1092                 }
1093                 node = rb_next(node);
1094                 if (!node)
1095                         break;
1096         }
1097 out:
1098         spin_unlock(&tree->lock);
1099         return ret;
1100 }
1101
1102 /* find the first state struct with 'bits' set after 'start', and
1103  * return it.  tree->lock must be held.  NULL will be returned if
1104  * nothing was found after 'start'
1105  */
1106 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1107                                                  u64 start, int bits)
1108 {
1109         struct rb_node *node;
1110         struct extent_state *state;
1111
1112         /*
1113          * this search will find all the extents that end after
1114          * our range starts.
1115          */
1116         node = tree_search(tree, start);
1117         if (!node)
1118                 goto out;
1119
1120         while (1) {
1121                 state = rb_entry(node, struct extent_state, rb_node);
1122                 if (state->end >= start && (state->state & bits))
1123                         return state;
1124
1125                 node = rb_next(node);
1126                 if (!node)
1127                         break;
1128         }
1129 out:
1130         return NULL;
1131 }
1132
1133 /*
1134  * find a contiguous range of bytes in the file marked as delalloc, not
1135  * more than 'max_bytes'.  start and end are used to return the range.
1136  *
1137  * 1 is returned if we find something, 0 if nothing was in the tree
1138  */
1139 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1140                                         u64 *start, u64 *end, u64 max_bytes)
1141 {
1142         struct rb_node *node;
1143         struct extent_state *state;
1144         u64 cur_start = *start;
1145         u64 found = 0;
1146         u64 total_bytes = 0;
1147
1148         spin_lock(&tree->lock);
1149
1150         /*
1151          * this search will find all the extents that end after
1152          * our range starts.
1153          */
1154         node = tree_search(tree, cur_start);
1155         if (!node) {
1156                 if (!found)
1157                         *end = (u64)-1;
1158                 goto out;
1159         }
1160
1161         while (1) {
1162                 state = rb_entry(node, struct extent_state, rb_node);
1163                 if (found && (state->start != cur_start ||
1164                               (state->state & EXTENT_BOUNDARY))) {
1165                         goto out;
1166                 }
1167                 if (!(state->state & EXTENT_DELALLOC)) {
1168                         if (!found)
1169                                 *end = state->end;
1170                         goto out;
1171                 }
1172                 if (!found)
1173                         *start = state->start;
1174                 found++;
1175                 *end = state->end;
1176                 cur_start = state->end + 1;
1177                 node = rb_next(node);
1178                 if (!node)
1179                         break;
1180                 total_bytes += state->end - state->start + 1;
1181                 if (total_bytes >= max_bytes)
1182                         break;
1183         }
1184 out:
1185         spin_unlock(&tree->lock);
1186         return found;
1187 }
1188
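/*
 * Unlock every page cached in [start, end] except locked_page, which the
 * caller is responsible for.
 */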
1189 static noinline int __unlock_for_delalloc(struct inode *inode,
1190                                           struct page *locked_page,
1191                                           u64 start, u64 end)
1192 {
1193         int ret;
1194         struct page *pages[16];
1195         unsigned long index = start >> PAGE_CACHE_SHIFT;
1196         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1197         unsigned long nr_pages = end_index - index + 1;
1198         int i;
1199
1200         if (index == locked_page->index && end_index == index)
1201                 return 0;
1202
1203         while (nr_pages > 0) {
1204                 ret = find_get_pages_contig(inode->i_mapping, index,
1205                                      min_t(unsigned long, nr_pages,
1206                                      ARRAY_SIZE(pages)), pages);
1207                 for (i = 0; i < ret; i++) {
1208                         if (pages[i] != locked_page)
1209                                 unlock_page(pages[i]);
1210                         page_cache_release(pages[i]);
1211                 }
1212                 nr_pages -= ret;
1213                 index += ret;
1214                 cond_resched();
1215         }
1216         return 0;
1217 }
1218
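/*
 * Lock all pages in the range after locked_page (the caller already holds
 * that one).  Returns 0 on success, or -EAGAIN, with everything locked so far
 * unlocked again, if a page has gone away or is no longer dirty.
 */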
1219 static noinline int lock_delalloc_pages(struct inode *inode,
1220                                         struct page *locked_page,
1221                                         u64 delalloc_start,
1222                                         u64 delalloc_end)
1223 {
1224         unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1225         unsigned long start_index = index;
1226         unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1227         unsigned long pages_locked = 0;
1228         struct page *pages[16];
1229         unsigned long nrpages;
1230         int ret;
1231         int i;
1232
1233         /* the caller is responsible for locking the start index */
1234         if (index == locked_page->index && index == end_index)
1235                 return 0;
1236
1237         /* skip the page at the start index */
1238         nrpages = end_index - index + 1;
1239         while (nrpages > 0) {
1240                 ret = find_get_pages_contig(inode->i_mapping, index,
1241                                      min_t(unsigned long,
1242                                      nrpages, ARRAY_SIZE(pages)), pages);
1243                 if (ret == 0) {
1244                         ret = -EAGAIN;
1245                         goto done;
1246                 }
1247                 /* now we have an array of pages, lock them all */
1248                 for (i = 0; i < ret; i++) {
1249                         /*
1250                          * the caller is taking responsibility for
1251                          * locked_page
1252                          */
1253                         if (pages[i] != locked_page) {
1254                                 lock_page(pages[i]);
1255                                 if (!PageDirty(pages[i]) ||
1256                                     pages[i]->mapping != inode->i_mapping) {
1257                                         ret = -EAGAIN;
1258                                         unlock_page(pages[i]);
1259                                         page_cache_release(pages[i]);
1260                                         goto done;
1261                                 }
1262                         }
1263                         page_cache_release(pages[i]);
1264                         pages_locked++;
1265                 }
1266                 nrpages -= ret;
1267                 index += ret;
1268                 cond_resched();
1269         }
1270         ret = 0;
1271 done:
1272         if (ret && pages_locked) {
1273                 __unlock_for_delalloc(inode, locked_page,
1274                               delalloc_start,
1275                               ((u64)(start_index + pages_locked - 1)) <<
1276                               PAGE_CACHE_SHIFT);
1277         }
1278         return ret;
1279 }
1280
1281 /*
1282  * find a contiguous range of bytes in the file marked as delalloc, not
1283  * more than 'max_bytes'.  start and end are used to return the range.
1284  *
1285  * 1 is returned if we find something, 0 if nothing was in the tree
1286  */
1287 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1288                                              struct extent_io_tree *tree,
1289                                              struct page *locked_page,
1290                                              u64 *start, u64 *end,
1291                                              u64 max_bytes)
1292 {
1293         u64 delalloc_start;
1294         u64 delalloc_end;
1295         u64 found;
1296         struct extent_state *cached_state = NULL;
1297         int ret;
1298         int loops = 0;
1299
1300 again:
1301         /* step one, find a bunch of delalloc bytes starting at start */
1302         delalloc_start = *start;
1303         delalloc_end = 0;
1304         found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1305                                     max_bytes);
1306         if (!found || delalloc_end <= *start) {
1307                 *start = delalloc_start;
1308                 *end = delalloc_end;
1309                 return found;
1310         }
1311
1312         /*
1313          * start comes from the offset of locked_page.  We have to lock
1314          * pages in order, so we can't process delalloc bytes before
1315          * locked_page
1316          */
1317         if (delalloc_start < *start)
1318                 delalloc_start = *start;
1319
1320         /*
1321          * make sure to limit the number of pages we try to lock down
1322          * if we're looping.
1323          */
1324         if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1325                 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1326
1327         /* step two, lock all the pages after the page that has start */
1328         ret = lock_delalloc_pages(inode, locked_page,
1329                                   delalloc_start, delalloc_end);
1330         if (ret == -EAGAIN) {
1331                 /* some of the pages are gone, lets avoid looping by
1332                  * shortening the size of the delalloc range we're searching
1333                  */
1334                 free_extent_state(cached_state);
1335                 if (!loops) {
1336                         unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1337                         max_bytes = PAGE_CACHE_SIZE - offset;
1338                         loops = 1;
1339                         goto again;
1340                 } else {
1341                         found = 0;
1342                         goto out_failed;
1343                 }
1344         }
1345         BUG_ON(ret);
1346
1347         /* step three, lock the state bits for the whole range */
1348         lock_extent_bits(tree, delalloc_start, delalloc_end,
1349                          0, &cached_state, GFP_NOFS);
1350
1351         /* then test to make sure it is all still delalloc */
1352         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1353                              EXTENT_DELALLOC, 1, cached_state);
1354         if (!ret) {
1355                 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1356                                      &cached_state, GFP_NOFS);
1357                 __unlock_for_delalloc(inode, locked_page,
1358                               delalloc_start, delalloc_end);
1359                 cond_resched();
1360                 goto again;
1361         }
1362         free_extent_state(cached_state);
1363         *start = delalloc_start;
1364         *end = delalloc_end;
1365 out_failed:
1366         return found;
1367 }
1368
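/*
 * Clear the requested bits on [start, end] and then walk the pages in the
 * range, applying whichever page-level operations the caller asked for
 * (clear dirty, set/end writeback, unlock).  locked_page is skipped; it
 * stays under the caller's control.
 */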
1369 int extent_clear_unlock_delalloc(struct inode *inode,
1370                                 struct extent_io_tree *tree,
1371                                 u64 start, u64 end, struct page *locked_page,
1372                                 int unlock_pages,
1373                                 int clear_unlock,
1374                                 int clear_delalloc, int clear_dirty,
1375                                 int set_writeback,
1376                                 int end_writeback)
1377 {
1378         int ret;
1379         struct page *pages[16];
1380         unsigned long index = start >> PAGE_CACHE_SHIFT;
1381         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1382         unsigned long nr_pages = end_index - index + 1;
1383         int i;
1384         int clear_bits = 0;
1385
1386         if (clear_unlock)
1387                 clear_bits |= EXTENT_LOCKED;
1388         if (clear_dirty)
1389                 clear_bits |= EXTENT_DIRTY;
1390
1391         if (clear_delalloc)
1392                 clear_bits |= EXTENT_DELALLOC;
1393
1394         clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1395         if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1396                 return 0;
1397
1398         while (nr_pages > 0) {
1399                 ret = find_get_pages_contig(inode->i_mapping, index,
1400                                      min_t(unsigned long,
1401                                      nr_pages, ARRAY_SIZE(pages)), pages);
1402                 for (i = 0; i < ret; i++) {
1403                         if (pages[i] == locked_page) {
1404                                 page_cache_release(pages[i]);
1405                                 continue;
1406                         }
1407                         if (clear_dirty)
1408                                 clear_page_dirty_for_io(pages[i]);
1409                         if (set_writeback)
1410                                 set_page_writeback(pages[i]);
1411                         if (end_writeback)
1412                                 end_page_writeback(pages[i]);
1413                         if (unlock_pages)
1414                                 unlock_page(pages[i]);
1415                         page_cache_release(pages[i]);
1416                 }
1417                 nr_pages -= ret;
1418                 index += ret;
1419                 cond_resched();
1420         }
1421         return 0;
1422 }
1423
1424 /*
1425  * count the number of bytes in the tree that have a given bit(s)
1426  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1427  * cached.  The total number found is returned.
1428  */
1429 u64 count_range_bits(struct extent_io_tree *tree,
1430                      u64 *start, u64 search_end, u64 max_bytes,
1431                      unsigned long bits)
1432 {
1433         struct rb_node *node;
1434         struct extent_state *state;
1435         u64 cur_start = *start;
1436         u64 total_bytes = 0;
1437         int found = 0;
1438
1439         if (search_end <= cur_start) {
1440                 WARN_ON(1);
1441                 return 0;
1442         }
1443
1444         spin_lock(&tree->lock);
1445         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1446                 total_bytes = tree->dirty_bytes;
1447                 goto out;
1448         }
1449         /*
1450          * this search will find all the extents that end after
1451          * our range starts.
1452          */
1453         node = tree_search(tree, cur_start);
1454         if (!node)
1455                 goto out;
1456
1457         while (1) {
1458                 state = rb_entry(node, struct extent_state, rb_node);
1459                 if (state->start > search_end)
1460                         break;
1461                 if (state->end >= cur_start && (state->state & bits)) {
1462                         total_bytes += min(search_end, state->end) + 1 -
1463                                        max(cur_start, state->start);
1464                         if (total_bytes >= max_bytes)
1465                                 break;
1466                         if (!found) {
1467                                 *start = state->start;
1468                                 found = 1;
1469                         }
1470                 }
1471                 node = rb_next(node);
1472                 if (!node)
1473                         break;
1474         }
1475 out:
1476         spin_unlock(&tree->lock);
1477         return total_bytes;
1478 }
1479
1480 /*
1481  * set the private field for a given byte offset in the tree.  If there isn't
1482  * an extent_state that starts at that offset, -ENOENT is returned.
1483  */
1484 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1485 {
1486         struct rb_node *node;
1487         struct extent_state *state;
1488         int ret = 0;
1489
1490         spin_lock(&tree->lock);
1491         /*
1492          * this search will find all the extents that end after
1493          * our range starts.
1494          */
1495         node = tree_search(tree, start);
1496         if (!node) {
1497                 ret = -ENOENT;
1498                 goto out;
1499         }
1500         state = rb_entry(node, struct extent_state, rb_node);
1501         if (state->start != start) {
1502                 ret = -ENOENT;
1503                 goto out;
1504         }
1505         state->private = private;
1506 out:
1507         spin_unlock(&tree->lock);
1508         return ret;
1509 }
1510
1511 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1512 {
1513         struct rb_node *node;
1514         struct extent_state *state;
1515         int ret = 0;
1516
1517         spin_lock(&tree->lock);
1518         /*
1519          * this search will find all the extents that end after
1520          * our range starts.
1521          */
1522         node = tree_search(tree, start);
1523         if (!node) {
1524                 ret = -ENOENT;
1525                 goto out;
1526         }
1527         state = rb_entry(node, struct extent_state, rb_node);
1528         if (state->start != start) {
1529                 ret = -ENOENT;
1530                 goto out;
1531         }
1532         *private = state->private;
1533 out:
1534         spin_unlock(&tree->lock);
1535         return ret;
1536 }
1537
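/*
 * Illustrative sketch (not part of the original source): the private field
 * is typically used to stash a small per-extent value, such as a checksum,
 * for an end_io handler to pick up later.  io_tree, offset and csum are
 * hypothetical names here:
 *
 *	set_state_private(io_tree, offset, (u64)csum);
 *	...
 *	u64 private;
 *	if (!get_state_private(io_tree, offset, &private))
 *		csum = (u32)private;
 *
 * both calls return -ENOENT if no extent_state starts at offset.
 */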
1538 /*
1539  * searches a range in the state tree for a given mask.
1540  * If 'filled' == 1, this returns 1 only if the entire range is covered
1541  * by extents that have the bits set.  Otherwise, 1 is returned if any
1542  * bit in the range is found set.
1543  */
1544 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1545                    int bits, int filled, struct extent_state *cached)
1546 {
1547         struct extent_state *state = NULL;
1548         struct rb_node *node;
1549         int bitset = 0;
1550
1551         spin_lock(&tree->lock);
1552         if (cached && cached->tree && cached->start == start)
1553                 node = &cached->rb_node;
1554         else
1555                 node = tree_search(tree, start);
1556         while (node && start <= end) {
1557                 state = rb_entry(node, struct extent_state, rb_node);
1558
1559                 if (filled && state->start > start) {
1560                         bitset = 0;
1561                         break;
1562                 }
1563
1564                 if (state->start > end)
1565                         break;
1566
1567                 if (state->state & bits) {
1568                         bitset = 1;
1569                         if (!filled)
1570                                 break;
1571                 } else if (filled) {
1572                         bitset = 0;
1573                         break;
1574                 }
1575                 start = state->end + 1;
1576                 if (start > end)
1577                         break;
1578                 node = rb_next(node);
1579                 if (!node) {
1580                         if (filled)
1581                                 bitset = 0;
1582                         break;
1583                 }
1584         }
1585         spin_unlock(&tree->lock);
1586         return bitset;
1587 }
1588
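/*
 * Illustrative sketch (not part of the original source): filled == 1 asks
 * whether the whole range is covered by extents carrying the bits, while
 * filled == 0 asks whether any part of the range carries them:
 *
 *	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
 *		... every byte in [start, end] is uptodate ...
 *
 *	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
 *		... at least part of [start, end] is still locked ...
 */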
1589 /*
1590  * helper function to set a given page up to date if all the
1591  * extents in the tree for that page are up to date
1592  */
1593 static int check_page_uptodate(struct extent_io_tree *tree,
1594                                struct page *page)
1595 {
1596         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1597         u64 end = start + PAGE_CACHE_SIZE - 1;
1598         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1599                 SetPageUptodate(page);
1600         return 0;
1601 }
1602
1603 /*
1604  * helper function to unlock a page if all the extents in the tree
1605  * for that page are unlocked
1606  */
1607 static int check_page_locked(struct extent_io_tree *tree,
1608                              struct page *page)
1609 {
1610         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1611         u64 end = start + PAGE_CACHE_SIZE - 1;
1612         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1613                 unlock_page(page);
1614         return 0;
1615 }
1616
1617 /*
1618  * helper function to end page writeback.  The extent writeback bits are
1619  * not consulted here, so writeback is simply ended on the page
1620  */
1621 static int check_page_writeback(struct extent_io_tree *tree,
1622                              struct page *page)
1623 {
1624         end_page_writeback(page);
1625         return 0;
1626 }
1627
1628 /* lots and lots of room for performance fixes in the end_bio funcs */
1629
1630 /*
1631  * after a writepage IO is done, we need to:
1632  * clear the uptodate bits on error
1633  * clear the writeback bits in the extent tree for this IO
1634  * end_page_writeback if the page has no more pending IO
1635  *
1636  * Scheduling is not allowed, so the extent state tree is expected
1637  * to have one and only one object corresponding to this IO.
1638  */
1639 static void end_bio_extent_writepage(struct bio *bio, int err)
1640 {
1641         int uptodate = err == 0;
1642         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1643         struct extent_io_tree *tree;
1644         u64 start;
1645         u64 end;
1646         int whole_page;
1647         int ret;
1648
1649         do {
1650                 struct page *page = bvec->bv_page;
1651                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1652
1653                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1654                          bvec->bv_offset;
1655                 end = start + bvec->bv_len - 1;
1656
1657                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1658                         whole_page = 1;
1659                 else
1660                         whole_page = 0;
1661
1662                 if (--bvec >= bio->bi_io_vec)
1663                         prefetchw(&bvec->bv_page->flags);
1664                 if (tree->ops && tree->ops->writepage_end_io_hook) {
1665                         ret = tree->ops->writepage_end_io_hook(page, start,
1666                                                        end, NULL, uptodate);
1667                         if (ret)
1668                                 uptodate = 0;
1669                 }
1670
1671                 if (!uptodate && tree->ops &&
1672                     tree->ops->writepage_io_failed_hook) {
1673                         ret = tree->ops->writepage_io_failed_hook(bio, page,
1674                                                          start, end, NULL);
1675                         if (ret == 0) {
1676                                 uptodate = (err == 0);
1677                                 continue;
1678                         }
1679                 }
1680
1681                 if (!uptodate) {
1682                         clear_extent_uptodate(tree, start, end, GFP_NOFS);
1683                         ClearPageUptodate(page);
1684                         SetPageError(page);
1685                 }
1686
1687                 if (whole_page)
1688                         end_page_writeback(page);
1689                 else
1690                         check_page_writeback(tree, page);
1691         } while (bvec >= bio->bi_io_vec);
1692
1693         bio_put(bio);
1694 }
1695
1696 /*
1697  * after a readpage IO is done, we need to:
1698  * clear the uptodate bits on error
1699  * set the uptodate bits if things worked
1700  * set the page up to date if all extents in the tree are uptodate
1701  * clear the lock bit in the extent tree
1702  * unlock the page if there are no other extents locked for it
1703  *
1704  * Scheduling is not allowed, so the extent state tree is expected
1705  * to have one and only one object corresponding to this IO.
1706  */
1707 static void end_bio_extent_readpage(struct bio *bio, int err)
1708 {
1709         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1710         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1711         struct extent_io_tree *tree;
1712         u64 start;
1713         u64 end;
1714         int whole_page;
1715         int ret;
1716
1717         if (err)
1718                 uptodate = 0;
1719
1720         do {
1721                 struct page *page = bvec->bv_page;
1722                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1723
1724                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1725                         bvec->bv_offset;
1726                 end = start + bvec->bv_len - 1;
1727
1728                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1729                         whole_page = 1;
1730                 else
1731                         whole_page = 0;
1732
1733                 if (--bvec >= bio->bi_io_vec)
1734                         prefetchw(&bvec->bv_page->flags);
1735
1736                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1737                         ret = tree->ops->readpage_end_io_hook(page, start, end,
1738                                                               NULL);
1739                         if (ret)
1740                                 uptodate = 0;
1741                 }
1742                 if (!uptodate && tree->ops &&
1743                     tree->ops->readpage_io_failed_hook) {
1744                         ret = tree->ops->readpage_io_failed_hook(bio, page,
1745                                                          start, end, NULL);
1746                         if (ret == 0) {
1747                                 uptodate =
1748                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
1749                                 if (err)
1750                                         uptodate = 0;
1751                                 continue;
1752                         }
1753                 }
1754
1755                 if (uptodate) {
1756                         set_extent_uptodate(tree, start, end,
1757                                             GFP_ATOMIC);
1758                 }
1759                 unlock_extent(tree, start, end, GFP_ATOMIC);
1760
1761                 if (whole_page) {
1762                         if (uptodate) {
1763                                 SetPageUptodate(page);
1764                         } else {
1765                                 ClearPageUptodate(page);
1766                                 SetPageError(page);
1767                         }
1768                         unlock_page(page);
1769                 } else {
1770                         if (uptodate) {
1771                                 check_page_uptodate(tree, page);
1772                         } else {
1773                                 ClearPageUptodate(page);
1774                                 SetPageError(page);
1775                         }
1776                         check_page_locked(tree, page);
1777                 }
1778         } while (bvec >= bio->bi_io_vec);
1779
1780         bio_put(bio);
1781 }
1782
1783 /*
1784  * IO done from prepare_write is pretty simple, we just unlock
1785  * the structs in the extent tree when done, and set the uptodate bits
1786  * as appropriate.
1787  */
1788 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1789 {
1790         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1791         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1792         struct extent_io_tree *tree;
1793         u64 start;
1794         u64 end;
1795
1796         do {
1797                 struct page *page = bvec->bv_page;
1798                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1799
1800                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1801                         bvec->bv_offset;
1802                 end = start + bvec->bv_len - 1;
1803
1804                 if (--bvec >= bio->bi_io_vec)
1805                         prefetchw(&bvec->bv_page->flags);
1806
1807                 if (uptodate) {
1808                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1809                 } else {
1810                         ClearPageUptodate(page);
1811                         SetPageError(page);
1812                 }
1813
1814                 unlock_extent(tree, start, end, GFP_ATOMIC);
1815
1816         } while (bvec >= bio->bi_io_vec);
1817
1818         bio_put(bio);
1819 }
1820
1821 static struct bio *
1822 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1823                  gfp_t gfp_flags)
1824 {
1825         struct bio *bio;
1826
1827         bio = bio_alloc(gfp_flags, nr_vecs);
1828
1829         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1830                 while (!bio && (nr_vecs /= 2))
1831                         bio = bio_alloc(gfp_flags, nr_vecs);
1832         }
1833
1834         if (bio) {
1835                 bio->bi_size = 0;
1836                 bio->bi_bdev = bdev;
1837                 bio->bi_sector = first_sector;
1838         }
1839         return bio;
1840 }
1841
1842 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1843                           unsigned long bio_flags)
1844 {
1845         int ret = 0;
1846         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1847         struct page *page = bvec->bv_page;
1848         struct extent_io_tree *tree = bio->bi_private;
1849         u64 start;
1850         u64 end;
1851
1852         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1853         end = start + bvec->bv_len - 1;
1854
1855         bio->bi_private = NULL;
1856
1857         bio_get(bio);
1858
1859         if (tree->ops && tree->ops->submit_bio_hook)
1860                 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1861                                            mirror_num, bio_flags);
1862         else
1863                 submit_bio(rw, bio);
1864         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1865                 ret = -EOPNOTSUPP;
1866         bio_put(bio);
1867         return ret;
1868 }
1869
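/*
 * add a page to the bio being built in *bio_ret.  If the page is not
 * contiguous with the pending bio, the bio flags change, the merge hook
 * rejects it, or bio_add_page() can't fit it, the pending bio is submitted
 * first and a fresh one is started for this page.  When bio_ret is NULL
 * the new bio is submitted immediately instead of being handed back.
 */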
1870 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1871                               struct page *page, sector_t sector,
1872                               size_t size, unsigned long offset,
1873                               struct block_device *bdev,
1874                               struct bio **bio_ret,
1875                               unsigned long max_pages,
1876                               bio_end_io_t end_io_func,
1877                               int mirror_num,
1878                               unsigned long prev_bio_flags,
1879                               unsigned long bio_flags)
1880 {
1881         int ret = 0;
1882         struct bio *bio;
1883         int nr;
1884         int contig = 0;
1885         int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1886         int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1887         size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1888
1889         if (bio_ret && *bio_ret) {
1890                 bio = *bio_ret;
1891                 if (old_compressed)
1892                         contig = bio->bi_sector == sector;
1893                 else
1894                         contig = bio->bi_sector + (bio->bi_size >> 9) ==
1895                                 sector;
1896
1897                 if (prev_bio_flags != bio_flags || !contig ||
1898                     (tree->ops && tree->ops->merge_bio_hook &&
1899                      tree->ops->merge_bio_hook(page, offset, page_size, bio,
1900                                                bio_flags)) ||
1901                     bio_add_page(bio, page, page_size, offset) < page_size) {
1902                         ret = submit_one_bio(rw, bio, mirror_num,
1903                                              prev_bio_flags);
1904                         bio = NULL;
1905                 } else {
1906                         return 0;
1907                 }
1908         }
1909         if (this_compressed)
1910                 nr = BIO_MAX_PAGES;
1911         else
1912                 nr = bio_get_nr_vecs(bdev);
1913
1914         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1915
1916         bio_add_page(bio, page, page_size, offset);
1917         bio->bi_end_io = end_io_func;
1918         bio->bi_private = tree;
1919
1920         if (bio_ret)
1921                 *bio_ret = bio;
1922         else
1923                 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1924
1925         return ret;
1926 }
1927
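/*
 * mark the page's ->private with EXTENT_PAGE_PRIVATE so the extent_io code
 * knows this page is managed by an extent_io_tree; a page cache reference
 * is taken along with the private flag.
 */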
1928 void set_page_extent_mapped(struct page *page)
1929 {
1930         if (!PagePrivate(page)) {
1931                 SetPagePrivate(page);
1932                 page_cache_get(page);
1933                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1934         }
1935 }
1936
1937 static void set_page_extent_head(struct page *page, unsigned long len)
1938 {
1939         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1940 }
1941
1942 /*
1943  * basic readpage implementation.  Locked extent state structs are inserted
1944  * into the tree and removed when the IO is done (by the end_io
1945  * handlers)
1946  */
1947 static int __extent_read_full_page(struct extent_io_tree *tree,
1948                                    struct page *page,
1949                                    get_extent_t *get_extent,
1950                                    struct bio **bio, int mirror_num,
1951                                    unsigned long *bio_flags)
1952 {
1953         struct inode *inode = page->mapping->host;
1954         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1955         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1956         u64 end;
1957         u64 cur = start;
1958         u64 extent_offset;
1959         u64 last_byte = i_size_read(inode);
1960         u64 block_start;
1961         u64 cur_end;
1962         sector_t sector;
1963         struct extent_map *em;
1964         struct block_device *bdev;
1965         int ret;
1966         int nr = 0;
1967         size_t page_offset = 0;
1968         size_t iosize;
1969         size_t disk_io_size;
1970         size_t blocksize = inode->i_sb->s_blocksize;
1971         unsigned long this_bio_flag = 0;
1972
1973         set_page_extent_mapped(page);
1974
1975         end = page_end;
1976         lock_extent(tree, start, end, GFP_NOFS);
1977
1978         if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1979                 char *userpage;
1980                 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1981
1982                 if (zero_offset) {
1983                         iosize = PAGE_CACHE_SIZE - zero_offset;
1984                         userpage = kmap_atomic(page, KM_USER0);
1985                         memset(userpage + zero_offset, 0, iosize);
1986                         flush_dcache_page(page);
1987                         kunmap_atomic(userpage, KM_USER0);
1988                 }
1989         }
1990         while (cur <= end) {
1991                 if (cur >= last_byte) {
1992                         char *userpage;
1993                         iosize = PAGE_CACHE_SIZE - page_offset;
1994                         userpage = kmap_atomic(page, KM_USER0);
1995                         memset(userpage + page_offset, 0, iosize);
1996                         flush_dcache_page(page);
1997                         kunmap_atomic(userpage, KM_USER0);
1998                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1999                                             GFP_NOFS);
2000                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2001                         break;
2002                 }
2003                 em = get_extent(inode, page, page_offset, cur,
2004                                 end - cur + 1, 0);
2005                 if (IS_ERR(em) || !em) {
2006                         SetPageError(page);
2007                         unlock_extent(tree, cur, end, GFP_NOFS);
2008                         break;
2009                 }
2010                 extent_offset = cur - em->start;
2011                 BUG_ON(extent_map_end(em) <= cur);
2012                 BUG_ON(end < cur);
2013
2014                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2015                         this_bio_flag = EXTENT_BIO_COMPRESSED;
2016
2017                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2018                 cur_end = min(extent_map_end(em) - 1, end);
2019                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2020                 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2021                         disk_io_size = em->block_len;
2022                         sector = em->block_start >> 9;
2023                 } else {
2024                         sector = (em->block_start + extent_offset) >> 9;
2025                         disk_io_size = iosize;
2026                 }
2027                 bdev = em->bdev;
2028                 block_start = em->block_start;
2029                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2030                         block_start = EXTENT_MAP_HOLE;
2031                 free_extent_map(em);
2032                 em = NULL;
2033
2034                 /* we've found a hole, just zero and go on */
2035                 if (block_start == EXTENT_MAP_HOLE) {
2036                         char *userpage;
2037                         userpage = kmap_atomic(page, KM_USER0);
2038                         memset(userpage + page_offset, 0, iosize);
2039                         flush_dcache_page(page);
2040                         kunmap_atomic(userpage, KM_USER0);
2041
2042                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2043                                             GFP_NOFS);
2044                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2045                         cur = cur + iosize;
2046                         page_offset += iosize;
2047                         continue;
2048                 }
2049                 /* the get_extent function already copied the data into the page */
2050                 if (test_range_bit(tree, cur, cur_end,
2051                                    EXTENT_UPTODATE, 1, NULL)) {
2052                         check_page_uptodate(tree, page);
2053                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2054                         cur = cur + iosize;
2055                         page_offset += iosize;
2056                         continue;
2057                 }
2058                 /* we have an inline extent but it didn't get marked up
2059                  * to date.  Error out
2060                  */
2061                 if (block_start == EXTENT_MAP_INLINE) {
2062                         SetPageError(page);
2063                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2064                         cur = cur + iosize;
2065                         page_offset += iosize;
2066                         continue;
2067                 }
2068
2069                 ret = 0;
2070                 if (tree->ops && tree->ops->readpage_io_hook) {
2071                         ret = tree->ops->readpage_io_hook(page, cur,
2072                                                           cur + iosize - 1);
2073                 }
2074                 if (!ret) {
2075                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2076                         pnr -= page->index;
2077                         ret = submit_extent_page(READ, tree, page,
2078                                          sector, disk_io_size, page_offset,
2079                                          bdev, bio, pnr,
2080                                          end_bio_extent_readpage, mirror_num,
2081                                          *bio_flags,
2082                                          this_bio_flag);
2083                         nr++;
2084                         *bio_flags = this_bio_flag;
2085                 }
2086                 if (ret)
2087                         SetPageError(page);
2088                 cur = cur + iosize;
2089                 page_offset += iosize;
2090         }
2091         if (!nr) {
2092                 if (!PageError(page))
2093                         SetPageUptodate(page);
2094                 unlock_page(page);
2095         }
2096         return 0;
2097 }
2098
2099 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2100                             get_extent_t *get_extent)
2101 {
2102         struct bio *bio = NULL;
2103         unsigned long bio_flags = 0;
2104         int ret;
2105
2106         ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2107                                       &bio_flags);
2108         if (bio)
2109                 submit_one_bio(READ, bio, 0, bio_flags);
2110         return ret;
2111 }
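
/*
 * Illustrative sketch (not part of the original source): a filesystem
 * ->readpage is expected to be a thin wrapper around this helper, roughly:
 *
 *	static int foo_readpage(struct file *file, struct page *page)
 *	{
 *		struct extent_io_tree *tree;
 *
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		return extent_read_full_page(tree, page, btrfs_get_extent);
 *	}
 *
 * the wrapper name is hypothetical; btrfs_get_extent stands in for whatever
 * get_extent_t callback maps file offsets to extent_maps.
 */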
2112
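/*
 * charge nr_written pages against wbc->nr_to_write, and for cyclic or
 * whole-file writeback (while quota remains) push the mapping's
 * writeback_index past the pages just handled so the next pass resumes
 * after them
 */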
2113 static noinline void update_nr_written(struct page *page,
2114                                       struct writeback_control *wbc,
2115                                       unsigned long nr_written)
2116 {
2117         wbc->nr_to_write -= nr_written;
2118         if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2119             wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2120                 page->mapping->writeback_index = page->index + nr_written;
2121 }
2122
2123 /*
2124  * the writepage semantics are similar to regular writepage.  extent
2125  * records are inserted to lock ranges in the tree, and as dirty areas
2126  * are found, they are marked writeback.  Then the lock bits are removed
2127  * and the end_io handler clears the writeback ranges
2128  */
2129 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2130                               void *data)
2131 {
2132         struct inode *inode = page->mapping->host;
2133         struct extent_page_data *epd = data;
2134         struct extent_io_tree *tree = epd->tree;
2135         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2136         u64 delalloc_start;
2137         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2138         u64 end;
2139         u64 cur = start;
2140         u64 extent_offset;
2141         u64 last_byte = i_size_read(inode);
2142         u64 block_start;
2143         u64 iosize;
2144         u64 unlock_start;
2145         sector_t sector;
2146         struct extent_state *cached_state = NULL;
2147         struct extent_map *em;
2148         struct block_device *bdev;
2149         int ret;
2150         int nr = 0;
2151         size_t pg_offset = 0;
2152         size_t blocksize;
2153         loff_t i_size = i_size_read(inode);
2154         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2155         u64 nr_delalloc;
2156         u64 delalloc_end;
2157         int page_started;
2158         int compressed;
2159         int write_flags;
2160         unsigned long nr_written = 0;
2161
2162         if (wbc->sync_mode == WB_SYNC_ALL)
2163                 write_flags = WRITE_SYNC_PLUG;
2164         else
2165                 write_flags = WRITE;
2166
2167         WARN_ON(!PageLocked(page));
2168         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2169         if (page->index > end_index ||
2170            (page->index == end_index && !pg_offset)) {
2171                 page->mapping->a_ops->invalidatepage(page, 0);
2172                 unlock_page(page);
2173                 return 0;
2174         }
2175
2176         if (page->index == end_index) {
2177                 char *userpage;
2178
2179                 userpage = kmap_atomic(page, KM_USER0);
2180                 memset(userpage + pg_offset, 0,
2181                        PAGE_CACHE_SIZE - pg_offset);
2182                 kunmap_atomic(userpage, KM_USER0);
2183                 flush_dcache_page(page);
2184         }
2185         pg_offset = 0;
2186
2187         set_page_extent_mapped(page);
2188
2189         delalloc_start = start;
2190         delalloc_end = 0;
2191         page_started = 0;
2192         if (!epd->extent_locked) {
2193                 u64 delalloc_to_write;
2194                 /*
2195                  * make sure the wbc mapping index is at least updated
2196                  * to this page.
2197                  */
2198                 update_nr_written(page, wbc, 0);
2199
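                /*
                 * walk every delalloc range that overlaps this page, lock
                 * it and hand it to fill_delalloc, which allocates extents
                 * and kicks off the IO.  fill_delalloc may write and unlock
                 * pages well beyond this one; that is reported back through
                 * page_started and nr_written.
                 */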
2200                 while (delalloc_end < page_end) {
2201                         nr_delalloc = find_lock_delalloc_range(inode, tree,
2202                                                        page,
2203                                                        &delalloc_start,
2204                                                        &delalloc_end,
2205                                                        128 * 1024 * 1024);
2206                         if (nr_delalloc == 0) {
2207                                 delalloc_start = delalloc_end + 1;
2208                                 continue;
2209                         }
2210                         tree->ops->fill_delalloc(inode, page, delalloc_start,
2211                                                  delalloc_end, &page_started,
2212                                                  &nr_written);
2213                         delalloc_to_write = (delalloc_end -
2214                                         max_t(u64, page_offset(page),
2215                                               delalloc_start) + 1) >>
2216                                         PAGE_CACHE_SHIFT;
2217                         if (wbc->nr_to_write < delalloc_to_write) {
2218                                 wbc->nr_to_write = min_t(long, 8192,
2219                                                  delalloc_to_write);
2220                         }
2221                         delalloc_start = delalloc_end + 1;
2222                 }
2223
2224                 /* did the fill delalloc function already unlock and start
2225                  * the IO?
2226                  */
2227                 if (page_started) {
2228                         ret = 0;
2229                         /*
2230                          * we've unlocked the page, so we can't update
2231                          * the mapping's writeback index, just update
2232                          * nr_to_write.
2233                          */
2234                         wbc->nr_to_write -= nr_written;
2235                         goto done_unlocked;
2236                 }
2237         }
2238         if (tree->ops && tree->ops->writepage_start_hook) {
2239                 ret = tree->ops->writepage_start_hook(page, start,
2240                                                       page_end);
2241                 if (ret == -EAGAIN) {
2242                         redirty_page_for_writepage(wbc, page);
2243                         update_nr_written(page, wbc, nr_written);
2244                         unlock_page(page);
2245                         ret = 0;
2246                         goto done_unlocked;
2247                 }
2248         }
2249
2250         /*
2251          * we don't want to touch the inode after unlocking the page,
2252          * so we update the mapping writeback index now
2253          */
2254         update_nr_written(page, wbc, nr_written + 1);
2255
2256         end = page_end;
2257         if (last_byte <= start) {
2258                 if (tree->ops && tree->ops->writepage_end_io_hook)
2259                         tree->ops->writepage_end_io_hook(page, start,
2260                                                          page_end, NULL, 1);
2261                 unlock_start = page_end + 1;
2262                 goto done;
2263         }
2264
2265         blocksize = inode->i_sb->s_blocksize;
2266
2267         while (cur <= end) {
2268                 if (cur >= last_byte) {
2269                         if (tree->ops && tree->ops->writepage_end_io_hook)
2270                                 tree->ops->writepage_end_io_hook(page, cur,
2271                                                          page_end, NULL, 1);
2272                         unlock_start = page_end + 1;
2273                         break;
2274                 }
2275                 em = epd->get_extent(inode, page, pg_offset, cur,
2276                                      end - cur + 1, 1);
2277                 if (IS_ERR(em) || !em) {
2278                         SetPageError(page);
2279                         break;
2280                 }
2281
2282                 extent_offset = cur - em->start;
2283                 BUG_ON(extent_map_end(em) <= cur);
2284                 BUG_ON(end < cur);
2285                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2286                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2287                 sector = (em->block_start + extent_offset) >> 9;
2288                 bdev = em->bdev;
2289                 block_start = em->block_start;
2290                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2291                 free_extent_map(em);
2292                 em = NULL;
2293
2294                 /*
2295                  * compressed and inline extents are written through other
2296                  * paths in the FS
2297                  */
2298                 if (compressed || block_start == EXTENT_MAP_HOLE ||
2299                     block_start == EXTENT_MAP_INLINE) {
2300                         /*
2301                          * end_io notification does not happen here for
2302                          * compressed extents
2303                          */
2304                         if (!compressed && tree->ops &&
2305                             tree->ops->writepage_end_io_hook)
2306                                 tree->ops->writepage_end_io_hook(page, cur,
2307                                                          cur + iosize - 1,
2308                                                          NULL, 1);
2309                         else if (compressed) {
2310                                 /* we don't want to end_page_writeback on
2311                                  * a compressed extent.  this happens
2312                                  * elsewhere
2313                                  */
2314                                 nr++;
2315                         }
2316
2317                         cur += iosize;
2318                         pg_offset += iosize;
2319                         unlock_start = cur;
2320                         continue;
2321                 }
2322                 /* leave this out until we have a page_mkwrite call */
2323                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2324                                    EXTENT_DIRTY, 0, NULL)) {
2325                         cur = cur + iosize;
2326                         pg_offset += iosize;
2327                         continue;
2328                 }
2329
2330                 if (tree->ops && tree->ops->writepage_io_hook) {
2331                         ret = tree->ops->writepage_io_hook(page, cur,
2332                                                 cur + iosize - 1);
2333                 } else {
2334                         ret = 0;
2335                 }
2336                 if (ret) {
2337                         SetPageError(page);
2338                 } else {
2339                         unsigned long max_nr = end_index + 1;
2340
2341                         set_range_writeback(tree, cur, cur + iosize - 1);
2342                         if (!PageWriteback(page)) {
2343                                 printk(KERN_ERR "btrfs warning page %lu not "
2344                                        "writeback, cur %llu end %llu\n",
2345                                        page->index, (unsigned long long)cur,
2346                                        (unsigned long long)end);
2347                         }
2348
2349                         ret = submit_extent_page(write_flags, tree, page,
2350                                                  sector, iosize, pg_offset,
2351                                                  bdev, &epd->bio, max_nr,
2352                                                  end_bio_extent_writepage,
2353                                                  0, 0, 0);
2354                         if (ret)
2355                                 SetPageError(page);
2356                 }
2357                 cur = cur + iosize;
2358                 pg_offset += iosize;
2359                 nr++;
2360         }
2361 done:
2362         if (nr == 0) {
2363                 /* make sure the mapping tag for page dirty gets cleared */
2364                 set_page_writeback(page);
2365                 end_page_writeback(page);
2366         }
2367         unlock_page(page);
2368
2369 done_unlocked:
2370
2371         /* drop our reference on any cached states */
2372         free_extent_state(cached_state);
2373         return 0;
2374 }
2375
2376 /**
2377  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2378  * @mapping: address space structure to write
2379  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2380  * @writepage: function called for each page
2381  * @data: data passed to writepage function
2382  *
2383  * If a page is already under I/O, write_cache_pages() skips it, even
2384  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2385  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2386  * and msync() need to guarantee that all the data which was dirty at the time
2387  * the call was made get new I/O started against them.  If wbc->sync_mode is
2388  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2389  * existing IO to complete.
2390  */
2391 static int extent_write_cache_pages(struct extent_io_tree *tree,
2392                              struct address_space *mapping,
2393                              struct writeback_control *wbc,
2394                              writepage_t writepage, void *data,
2395                              void (*flush_fn)(void *))
2396 {
2397         int ret = 0;
2398         int done = 0;
2399         struct pagevec pvec;
2400         int nr_pages;
2401         pgoff_t index;
2402         pgoff_t end;            /* Inclusive */
2403         int scanned = 0;
2404         int range_whole = 0;
2405
2406         pagevec_init(&pvec, 0);
2407         if (wbc->range_cyclic) {
2408                 index = mapping->writeback_index; /* Start from prev offset */
2409                 end = -1;
2410         } else {
2411                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2412                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2413                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2414                         range_whole = 1;
2415                 scanned = 1;
2416         }
2417 retry:
2418         while (!done && (index <= end) &&
2419                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2420                               PAGECACHE_TAG_DIRTY, min(end - index,
2421                                   (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2422                 unsigned i;
2423
2424                 scanned = 1;
2425                 for (i = 0; i < nr_pages; i++) {
2426                         struct page *page = pvec.pages[i];
2427
2428                         /*
2429                          * At this point we hold neither mapping->tree_lock nor
2430                          * lock on the page itself: the page may be truncated or
2431                          * invalidated (changing page->mapping to NULL), or even
2432                          * swizzled back from swapper_space to tmpfs file
2433                          * mapping
2434                          */
2435                         if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2436                                 tree->ops->write_cache_pages_lock_hook(page);
2437                         else
2438                                 lock_page(page);
2439
2440                         if (unlikely(page->mapping != mapping)) {
2441                                 unlock_page(page);
2442                                 continue;
2443                         }
2444
2445                         if (!wbc->range_cyclic && page->index > end) {
2446                                 done = 1;
2447                                 unlock_page(page);
2448                                 continue;
2449                         }
2450
2451                         if (wbc->sync_mode != WB_SYNC_NONE) {
2452                                 if (PageWriteback(page))
2453                                         flush_fn(data);
2454                                 wait_on_page_writeback(page);
2455                         }
2456
2457                         if (PageWriteback(page) ||
2458                             !clear_page_dirty_for_io(page)) {
2459                                 unlock_page(page);
2460                                 continue;
2461                         }
2462
2463                         ret = (*writepage)(page, wbc, data);
2464
2465                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2466                                 unlock_page(page);
2467                                 ret = 0;
2468                         }
2469                         if (ret || wbc->nr_to_write <= 0)
2470                                 done = 1;
2471                 }
2472                 pagevec_release(&pvec);
2473                 cond_resched();
2474         }
2475         if (!scanned && !done) {
2476                 /*
2477                  * We hit the last page and there is more work to be done: wrap
2478                  * back to the start of the file
2479                  */
2480                 scanned = 1;
2481                 index = 0;
2482                 goto retry;
2483         }
2484         return ret;
2485 }
2486
2487 static void flush_epd_write_bio(struct extent_page_data *epd)
2488 {
2489         if (epd->bio) {
2490                 if (epd->sync_io)
2491                         submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2492                 else
2493                         submit_one_bio(WRITE, epd->bio, 0, 0);
2494                 epd->bio = NULL;
2495         }
2496 }
2497
2498 static noinline void flush_write_bio(void *data)
2499 {
2500         struct extent_page_data *epd = data;
2501         flush_epd_write_bio(epd);
2502 }
2503
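/*
 * write out the page that was handed in, then use wbc_writepages below to
 * opportunistically write up to 64 more dirty pages that follow it in the
 * file before the pending bio is flushed
 */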
2504 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2505                           get_extent_t *get_extent,
2506                           struct writeback_control *wbc)
2507 {
2508         int ret;
2509         struct address_space *mapping = page->mapping;
2510         struct extent_page_data epd = {
2511                 .bio = NULL,
2512                 .tree = tree,
2513                 .get_extent = get_extent,
2514                 .extent_locked = 0,
2515                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2516         };
2517         struct writeback_control wbc_writepages = {
2518                 .bdi            = wbc->bdi,
2519                 .sync_mode      = wbc->sync_mode,
2520                 .older_than_this = NULL,
2521                 .nr_to_write    = 64,
2522                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
2523                 .range_end      = (loff_t)-1,
2524         };
2525
2526         ret = __extent_writepage(page, wbc, &epd);
2527
2528         extent_write_cache_pages(tree, mapping, &wbc_writepages,
2529                                  __extent_writepage, &epd, flush_write_bio);
2530         flush_epd_write_bio(&epd);
2531         return ret;
2532 }
2533
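/*
 * write a range of pages that the caller has already locked (extent_locked
 * is set so __extent_writepage skips the state bit locking).  Pages that
 * are no longer dirty are not rewritten; their end_io hook is called
 * directly so per-range completion accounting still happens.
 */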
2534 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2535                               u64 start, u64 end, get_extent_t *get_extent,
2536                               int mode)
2537 {
2538         int ret = 0;
2539         struct address_space *mapping = inode->i_mapping;
2540         struct page *page;
2541         unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2542                 PAGE_CACHE_SHIFT;
2543
2544         struct extent_page_data epd = {
2545                 .bio = NULL,
2546                 .tree = tree,
2547                 .get_extent = get_extent,
2548                 .extent_locked = 1,
2549                 .sync_io = mode == WB_SYNC_ALL,
2550         };
2551         struct writeback_control wbc_writepages = {
2552                 .bdi            = inode->i_mapping->backing_dev_info,
2553                 .sync_mode      = mode,
2554                 .older_than_this = NULL,
2555                 .nr_to_write    = nr_pages * 2,
2556                 .range_start    = start,
2557                 .range_end      = end + 1,
2558         };
2559
2560         while (start <= end) {
2561                 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2562                 if (clear_page_dirty_for_io(page))
2563                         ret = __extent_writepage(page, &wbc_writepages, &epd);
2564                 else {
2565                         if (tree->ops && tree->ops->writepage_end_io_hook)
2566                                 tree->ops->writepage_end_io_hook(page, start,
2567                                                  start + PAGE_CACHE_SIZE - 1,
2568                                                  NULL, 1);
2569                         unlock_page(page);
2570                 }
2571                 page_cache_release(page);
2572                 start += PAGE_CACHE_SIZE;
2573         }
2574
2575         flush_epd_write_bio(&epd);
2576         return ret;
2577 }
2578
2579 int extent_writepages(struct extent_io_tree *tree,
2580                       struct address_space *mapping,
2581                       get_extent_t *get_extent,
2582                       struct writeback_control *wbc)
2583 {
2584         int ret = 0;
2585         struct extent_page_data epd = {
2586                 .bio = NULL,
2587                 .tree = tree,
2588                 .get_extent = get_extent,
2589                 .extent_locked = 0,
2590                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2591         };
2592
2593         ret = extent_write_cache_pages(tree, mapping, wbc,
2594                                        __extent_writepage, &epd,
2595                                        flush_write_bio);
2596         flush_epd_write_bio(&epd);
2597         return ret;
2598 }
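
/*
 * Illustrative sketch (not part of the original source): a filesystem
 * ->writepages would typically hand its dirty pages to this helper,
 * roughly:
 *
 *	static int foo_writepages(struct address_space *mapping,
 *				  struct writeback_control *wbc)
 *	{
 *		struct extent_io_tree *tree;
 *
 *		tree = &BTRFS_I(mapping->host)->io_tree;
 *		return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
 *	}
 *
 * the wrapper name is hypothetical; btrfs_get_extent again stands in for
 * the get_extent_t callback.
 */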
2599
2600 int extent_readpages(struct extent_io_tree *tree,
2601                      struct address_space *mapping,
2602                      struct list_head *pages, unsigned nr_pages,
2603                      get_extent_t get_extent)
2604 {
2605         struct bio *bio = NULL;
2606         unsigned page_idx;
2607         struct pagevec pvec;
2608         unsigned long bio_flags = 0;
2609
2610         pagevec_init(&pvec, 0);
2611         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2612                 struct page *page = list_entry(pages->prev, struct page, lru);
2613
2614                 prefetchw(&page->flags);
2615                 list_del(&page->lru);
2616                 /*
2617                  * what we want to do here is call add_to_page_cache_lru,
2618                  * but that isn't exported, so we reproduce it here
2619                  */
2620                 if (!add_to_page_cache(page, mapping,
2621                                         page->index, GFP_KERNEL)) {
2622
2623                         /* open coding of lru_cache_add, also not exported */
2624                         page_cache_get(page);
2625                         if (!pagevec_add(&pvec, page))
2626                                 __pagevec_lru_add_file(&pvec);
2627                         __extent_read_full_page(tree, page, get_extent,
2628                                                 &bio, 0, &bio_flags);
2629                 }
2630                 page_cache_release(page);
2631         }
2632         if (pagevec_count(&pvec))
2633                 __pagevec_lru_add_file(&pvec);
2634         BUG_ON(!list_empty(pages));
2635         if (bio)
2636                 submit_one_bio(READ, bio, 0, bio_flags);
2637         return 0;
2638 }
2639
2640 /*
2641  * basic invalidatepage code, this waits on any locked or writeback
2642  * ranges corresponding to the page, and then deletes any extent state
2643  * records from the tree
2644  */
2645 int extent_invalidatepage(struct extent_io_tree *tree,
2646                           struct page *page, unsigned long offset)
2647 {
2648         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2649         u64 end = start + PAGE_CACHE_SIZE - 1;
2650         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2651
2652         start += (offset + blocksize - 1) & ~(blocksize - 1);
2653         if (start > end)
2654                 return 0;
2655
2656         lock_extent(tree, start, end, GFP_NOFS);
2657         wait_on_page_writeback(page);
2658         clear_extent_bit(tree, start, end,
2659                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2660                          1, 1, NULL, GFP_NOFS);
2661         return 0;
2662 }
2663
2664 /*
2665  * simple commit_write call, the page is marked dirty and the inode
2666  * size is pushed out if the write extends past the old i_size
2667  */
2668 int extent_commit_write(struct extent_io_tree *tree,
2669                         struct inode *inode, struct page *page,
2670                         unsigned from, unsigned to)
2671 {
2672         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2673
2674         set_page_extent_mapped(page);
2675         set_page_dirty(page);
2676
2677         if (pos > inode->i_size) {
2678                 i_size_write(inode, pos);
2679                 mark_inode_dirty(inode);
2680         }
2681         return 0;
2682 }
2683
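/*
 * prepare the blocks covering [from, to) in this page for a write: newly
 * allocated blocks that are only partially overwritten are zeroed outside
 * the written range, existing blocks that are partially overwritten and
 * not yet uptodate are read in, and everything else is simply marked
 * uptodate.  The reads are waited on before returning.
 */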
2684 int extent_prepare_write(struct extent_io_tree *tree,
2685                          struct inode *inode, struct page *page,
2686                          unsigned from, unsigned to, get_extent_t *get_extent)
2687 {
2688         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2689         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2690         u64 block_start;
2691         u64 orig_block_start;
2692         u64 block_end;
2693         u64 cur_end;
2694         struct extent_map *em;
2695         unsigned blocksize = 1 << inode->i_blkbits;
2696         size_t page_offset = 0;
2697         size_t block_off_start;
2698         size_t block_off_end;
2699         int err = 0;
2700         int iocount = 0;
2701         int ret = 0;
2702         int isnew;
2703
2704         set_page_extent_mapped(page);
2705
2706         block_start = (page_start + from) & ~((u64)blocksize - 1);
2707         block_end = (page_start + to - 1) | (blocksize - 1);
2708         orig_block_start = block_start;
2709
2710         lock_extent(tree, page_start, page_end, GFP_NOFS);
2711         while (block_start <= block_end) {
2712                 em = get_extent(inode, page, page_offset, block_start,
2713                                 block_end - block_start + 1, 1);
2714                 if (IS_ERR(em) || !em)
2715                         goto err;
2716
2717                 cur_end = min(block_end, extent_map_end(em) - 1);
2718                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2719                 block_off_end = block_off_start + blocksize;
2720                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2721
2722                 if (!PageUptodate(page) && isnew &&
2723                     (block_off_end > to || block_off_start < from)) {
2724                         void *kaddr;
2725
2726                         kaddr = kmap_atomic(page, KM_USER0);
2727                         if (block_off_end > to)
2728                                 memset(kaddr + to, 0, block_off_end - to);
2729                         if (block_off_start < from)
2730                                 memset(kaddr + block_off_start, 0,
2731                                        from - block_off_start);
2732                         flush_dcache_page(page);
2733                         kunmap_atomic(kaddr, KM_USER0);
2734                 }
2735                 if ((em->block_start != EXTENT_MAP_HOLE &&
2736                      em->block_start != EXTENT_MAP_INLINE) &&
2737                     !isnew && !PageUptodate(page) &&
2738                     (block_off_end > to || block_off_start < from) &&
2739                     !test_range_bit(tree, block_start, cur_end,
2740                                     EXTENT_UPTODATE, 1, NULL)) {
2741                         u64 sector;
2742                         u64 extent_offset = block_start - em->start;
2743                         size_t iosize;
2744                         sector = (em->block_start + extent_offset) >> 9;
2745                         iosize = (cur_end - block_start + blocksize) &
2746                                 ~((u64)blocksize - 1);
2747                         /*
2748                          * we've already got the extent locked, but we
2749                          * need to split the state such that our end_bio
2750                          * handler can clear the lock.
2751                          */
2752                         set_extent_bit(tree, block_start,
2753                                        block_start + iosize - 1,
2754                                        EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
2755                         ret = submit_extent_page(READ, tree, page,
2756                                          sector, iosize, page_offset, em->bdev,
2757                                          NULL, 1,
2758                                          end_bio_extent_preparewrite, 0,
2759                                          0, 0);
2760                         iocount++;
2761                         block_start = block_start + iosize;
2762                 } else {
2763                         set_extent_uptodate(tree, block_start, cur_end,
2764                                             GFP_NOFS);
2765                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2766                         block_start = cur_end + 1;
2767                 }
2768                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2769                 free_extent_map(em);
2770         }
2771         if (iocount) {
2772                 wait_extent_bit(tree, orig_block_start,
2773                                 block_end, EXTENT_LOCKED);
2774         }
2775         check_page_uptodate(tree, page);
2776 err:
2777         /* FIXME, zero out newly allocated blocks on error */
2778         return err;
2779 }
2780
2781 /*
2782  * a helper for releasepage, this tests for areas of the page that
2783  * are locked or under IO and drops the related state bits if it is safe
2784  * to drop the page.
2785  */
2786 int try_release_extent_state(struct extent_map_tree *map,
2787                              struct extent_io_tree *tree, struct page *page,
2788                              gfp_t mask)
2789 {
2790         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2791         u64 end = start + PAGE_CACHE_SIZE - 1;
2792         int ret = 1;
2793
2794         if (test_range_bit(tree, start, end,
2795                            EXTENT_IOBITS | EXTENT_ORDERED, 0, NULL))
2796                 ret = 0;
2797         else {
2798                 if ((mask & GFP_NOFS) == GFP_NOFS)
2799                         mask = GFP_NOFS;
2800                 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2801                                  1, 1, NULL, mask);
2802         }
2803         return ret;
2804 }
2805
2806 /*
2807  * a helper for releasepage.  As long as there are no locked extents
2808  * in the range corresponding to the page, both state records and extent
2809  * map records are removed
2810  */
2811 int try_release_extent_mapping(struct extent_map_tree *map,
2812                                struct extent_io_tree *tree, struct page *page,
2813                                gfp_t mask)
2814 {
2815         struct extent_map *em;
2816         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2817         u64 end = start + PAGE_CACHE_SIZE - 1;
2818
2819         if ((mask & __GFP_WAIT) &&
2820             page->mapping->host->i_size > 16 * 1024 * 1024) {
2821                 u64 len;
2822                 while (start <= end) {
2823                         len = end - start + 1;
2824                         write_lock(&map->lock);
2825                         em = lookup_extent_mapping(map, start, len);
2826                         if (!em || IS_ERR(em)) {
2827                                 write_unlock(&map->lock);
2828                                 break;
2829                         }
2830                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2831                             em->start != start) {
2832                                 write_unlock(&map->lock);
2833                                 free_extent_map(em);
2834                                 break;
2835                         }
2836                         if (!test_range_bit(tree, em->start,
2837                                             extent_map_end(em) - 1,
2838                                             EXTENT_LOCKED | EXTENT_WRITEBACK |
2839                                             EXTENT_ORDERED,
2840                                             0, NULL)) {
2841                                 remove_extent_mapping(map, em);
2842                                 /* once for the rb tree */
2843                                 free_extent_map(em);
2844                         }
2845                         start = extent_map_end(em);
2846                         write_unlock(&map->lock);
2847
2848                         /* once for us */
2849                         free_extent_map(em);
2850                 }
2851         }
2852         return try_release_extent_state(map, tree, page, mask);
2853 }
2854
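/*
 * bmap helper: map a logical file block to an on-disk sector number.
 * The block's range is locked around the extent lookup; lookup failures
 * and special extents (block_start beyond EXTENT_MAP_LAST_BYTE, i.e.
 * holes, inline and delalloc extents) report sector 0.
 */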
2855 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2856                 get_extent_t *get_extent)
2857 {
2858         struct inode *inode = mapping->host;
2859         u64 start = iblock << inode->i_blkbits;
2860         sector_t sector = 0;
2861         size_t blksize = (1 << inode->i_blkbits);
2862         struct extent_map *em;
2863
2864         lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2865                     GFP_NOFS);
2866         em = get_extent(inode, NULL, 0, start, blksize, 0);
2867         unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2868                       GFP_NOFS);
2869         if (!em || IS_ERR(em))
2870                 return 0;
2871
2872         if (em->block_start > EXTENT_MAP_LAST_BYTE)
2873                 goto out;
2874
2875         sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2876 out:
2877         free_extent_map(em);
2878         return sector;
2879 }
2880
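/*
 * fiemap helper: walk the extent mappings covering [start, start + len)
 * and report each one via fiemap_fill_next_extent, translating the
 * special block_start values into FIEMAP_EXTENT_* flags.  The last
 * extent is detected either by running past the requested range or by
 * hitting a vacancy mapping.
 */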
2881 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2882                 __u64 start, __u64 len, get_extent_t *get_extent)
2883 {
2884         int ret;
2885         u64 off = start;
2886         u64 max = start + len;
2887         u32 flags = 0;
2888         u64 disko = 0;
2889         struct extent_map *em = NULL;
2890         int end = 0;
2891         u64 em_start = 0, em_len = 0;
2892         unsigned long emflags;
2893         ret = 0;
2894
2895         if (len == 0)
2896                 return -EINVAL;
2897
2898         lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2899                 GFP_NOFS);
2900         em = get_extent(inode, NULL, 0, off, max - off, 0);
2901         if (!em)
2902                 goto out;
2903         if (IS_ERR(em)) {
2904                 ret = PTR_ERR(em);
2905                 goto out;
2906         }
2907         while (!end) {
2908                 off = em->start + em->len;
2909                 if (off >= max)
2910                         end = 1;
2911
2912                 em_start = em->start;
2913                 em_len = em->len;
2914
2915                 disko = 0;
2916                 flags = 0;
2917
2918                 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2919                         end = 1;
2920                         flags |= FIEMAP_EXTENT_LAST;
2921                 } else if (em->block_start == EXTENT_MAP_HOLE) {
2922                         flags |= FIEMAP_EXTENT_UNWRITTEN;
2923                 } else if (em->block_start == EXTENT_MAP_INLINE) {
2924                         flags |= (FIEMAP_EXTENT_DATA_INLINE |
2925                                   FIEMAP_EXTENT_NOT_ALIGNED);
2926                 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2927                         flags |= (FIEMAP_EXTENT_DELALLOC |
2928                                   FIEMAP_EXTENT_UNKNOWN);
2929                 } else {
2930                         disko = em->block_start;
2931                 }
2932                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2933                         flags |= FIEMAP_EXTENT_ENCODED;
2934
2935                 emflags = em->flags;
2936                 free_extent_map(em);
2937                 em = NULL;
2938
2939                 if (!end) {
2940                         em = get_extent(inode, NULL, 0, off, max - off, 0);
2941                         if (!em)
2942                                 goto out;
2943                         if (IS_ERR(em)) {
2944                                 ret = PTR_ERR(em);
2945                                 goto out;
2946                         }
2947                         emflags = em->flags;
2948                 }
2949                 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2950                         flags |= FIEMAP_EXTENT_LAST;
2951                         end = 1;
2952                 }
2953
2954                 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2955                                         em_len, flags);
2956                 if (ret)
2957                         goto out_free;
2958         }
2959 out_free:
2960         free_extent_map(em);
2961 out:
2962         unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2963                         GFP_NOFS);
2964         return ret;
2965 }
2966
2967 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2968                                               unsigned long i)
2969 {
2970         struct page *p;
2971         struct address_space *mapping;
2972
2973         if (i == 0)
2974                 return eb->first_page;
2975         i += eb->start >> PAGE_CACHE_SHIFT;
2976         mapping = eb->first_page->mapping;
2977         if (!mapping)
2978                 return NULL;
2979
2980         /*
2981          * extent_buffer_page is only called after pinning the page
2982          * by increasing the reference count.  So we know the page must
2983          * be in the radix tree.
2984          */
2985         rcu_read_lock();
2986         p = radix_tree_lookup(&mapping->page_tree, i);
2987         rcu_read_unlock();
2988
2989         return p;
2990 }
2991
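/*
 * number of pages covered by an extent buffer; e.g. with 4K pages a
 * buffer at start 4096 with len 8192 spans 2 pages.
 */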
2992 static inline unsigned long num_extent_pages(u64 start, u64 len)
2993 {
2994         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2995                 (start >> PAGE_CACHE_SHIFT);
2996 }
2997
2998 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2999                                                    u64 start,
3000                                                    unsigned long len,
3001                                                    gfp_t mask)
3002 {
3003         struct extent_buffer *eb = NULL;
3004 #if LEAK_DEBUG
3005         unsigned long flags;
3006 #endif
3007
3008         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
             if (!eb)
                     return NULL;
3009         eb->start = start;
3010         eb->len = len;
3011         spin_lock_init(&eb->lock);
3012         init_waitqueue_head(&eb->lock_wq);
3013
3014 #if LEAK_DEBUG
3015         spin_lock_irqsave(&leak_lock, flags);
3016         list_add(&eb->leak_list, &buffers);
3017         spin_unlock_irqrestore(&leak_lock, flags);
3018 #endif
3019         atomic_set(&eb->refs, 1);
3020
3021         return eb;
3022 }
3023
3024 static void __free_extent_buffer(struct extent_buffer *eb)
3025 {
3026 #if LEAK_DEBUG
3027         unsigned long flags;
3028         spin_lock_irqsave(&leak_lock, flags);
3029         list_del(&eb->leak_list);
3030         spin_unlock_irqrestore(&leak_lock, flags);
3031 #endif
3032         kmem_cache_free(extent_buffer_cache, eb);
3033 }
3034
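/*
 * Find or create the extent buffer covering [start, start + len).  If the
 * buffer is already in the tree just take a reference; otherwise allocate
 * it, pin its pages (page0 may be supplied by the caller) and insert it
 * into the buffer tree.  If someone raced us and inserted the same range
 * first, the winner is returned instead and our copy is freed.
 */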
3035 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3036                                           u64 start, unsigned long len,
3037                                           struct page *page0,
3038                                           gfp_t mask)
3039 {
3040         unsigned long num_pages = num_extent_pages(start, len);
3041         unsigned long i;
3042         unsigned long index = start >> PAGE_CACHE_SHIFT;
3043         struct extent_buffer *eb;
3044         struct extent_buffer *exists = NULL;
3045         struct page *p;
3046         struct address_space *mapping = tree->mapping;
3047         int uptodate = 1;
3048
3049         spin_lock(&tree->buffer_lock);
3050         eb = buffer_search(tree, start);
3051         if (eb) {
3052                 atomic_inc(&eb->refs);
3053                 spin_unlock(&tree->buffer_lock);
3054                 mark_page_accessed(eb->first_page);
3055                 return eb;
3056         }
3057         spin_unlock(&tree->buffer_lock);
3058
3059         eb = __alloc_extent_buffer(tree, start, len, mask);
3060         if (!eb)
3061                 return NULL;
3062
3063         if (page0) {
3064                 eb->first_page = page0;
3065                 i = 1;
3066                 index++;
3067                 page_cache_get(page0);
3068                 mark_page_accessed(page0);
3069                 set_page_extent_mapped(page0);
3070                 set_page_extent_head(page0, len);
3071                 uptodate = PageUptodate(page0);
3072         } else {
3073                 i = 0;
3074         }
3075         for (; i < num_pages; i++, index++) {
3076                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3077                 if (!p) {
3078                         WARN_ON(1);
3079                         goto free_eb;
3080                 }
3081                 set_page_extent_mapped(p);
3082                 mark_page_accessed(p);
3083                 if (i == 0) {
3084                         eb->first_page = p;
3085                         set_page_extent_head(p, len);
3086                 } else {
3087                         set_page_private(p, EXTENT_PAGE_PRIVATE);
3088                 }
3089                 if (!PageUptodate(p))
3090                         uptodate = 0;
3091                 unlock_page(p);
3092         }
3093         if (uptodate)
3094                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3095
3096         spin_lock(&tree->buffer_lock);
3097         exists = buffer_tree_insert(tree, start, &eb->rb_node);
3098         if (exists) {
3099                 /* add one reference for the caller */
3100                 atomic_inc(&exists->refs);
3101                 spin_unlock(&tree->buffer_lock);
3102                 goto free_eb;
3103         }
3104         spin_unlock(&tree->buffer_lock);
3105
3106         /* add one reference for the tree */
3107         atomic_inc(&eb->refs);
3108         return eb;
3109
3110 free_eb:
3111         if (!atomic_dec_and_test(&eb->refs))
3112                 return exists;
3113         for (index = 1; index < i; index++)
3114                 page_cache_release(extent_buffer_page(eb, index));
3115         if (i > 0)
                     page_cache_release(extent_buffer_page(eb, 0));
3116         __free_extent_buffer(eb);
3117         return exists;
3118 }
3119
3120 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3121                                          u64 start, unsigned long len,
3122                                           gfp_t mask)
3123 {
3124         struct extent_buffer *eb;
3125
3126         spin_lock(&tree->buffer_lock);
3127         eb = buffer_search(tree, start);
3128         if (eb)
3129                 atomic_inc(&eb->refs);
3130         spin_unlock(&tree->buffer_lock);
3131
3132         if (eb)
3133                 mark_page_accessed(eb->first_page);
3134
3135         return eb;
3136 }
3137
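/*
 * Drop a reference on an extent buffer.  The last reference is expected
 * to be dropped via try_release_extent_buffer (the releasepage path),
 * which is why hitting zero here warns.
 */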
3138 void free_extent_buffer(struct extent_buffer *eb)
3139 {
3140         if (!eb)
3141                 return;
3142
3143         if (!atomic_dec_and_test(&eb->refs))
3144                 return;
3145
3146         WARN_ON(1);
3147 }
3148
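/*
 * Clear the dirty state from every page backing the buffer so writeback
 * skips them, clearing the radix tree dirty tag as well once the page
 * itself is no longer dirty.
 */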
3149 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3150                               struct extent_buffer *eb)
3151 {
3152         unsigned long i;
3153         unsigned long num_pages;
3154         struct page *page;
3155
3156         num_pages = num_extent_pages(eb->start, eb->len);
3157
3158         for (i = 0; i < num_pages; i++) {
3159                 page = extent_buffer_page(eb, i);
3160                 if (!PageDirty(page))
3161                         continue;
3162
3163                 lock_page(page);
3164                 if (i == 0)
3165                         set_page_extent_head(page, eb->len);
3166                 else
3167                         set_page_private(page, EXTENT_PAGE_PRIVATE);
3168
3169                 clear_page_dirty_for_io(page);
3170                 spin_lock_irq(&page->mapping->tree_lock);
3171                 if (!PageDirty(page)) {
3172                         radix_tree_tag_clear(&page->mapping->page_tree,
3173                                                 page_index(page),
3174                                                 PAGECACHE_TAG_DIRTY);
3175                 }
3176                 spin_unlock_irq(&page->mapping->tree_lock);
3177                 unlock_page(page);
3178         }
3179         return 0;
3180 }
3181
3182 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3183                                     struct extent_buffer *eb)
3184 {
3185         return wait_on_extent_writeback(tree, eb->start,
3186                                         eb->start + eb->len - 1);
3187 }
3188
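/*
 * Mark every page backing the buffer dirty.  Returns the previous value
 * of the buffer's DIRTY bit so callers can tell whether it was already
 * dirty.
 */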
3189 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3190                              struct extent_buffer *eb)
3191 {
3192         unsigned long i;
3193         unsigned long num_pages;
3194         int was_dirty = 0;
3195
3196         was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3197         num_pages = num_extent_pages(eb->start, eb->len);
3198         for (i = 0; i < num_pages; i++)
3199                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3200         return was_dirty;
3201 }
3202
3203 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3204                                 struct extent_buffer *eb)
3205 {
3206         unsigned long i;
3207         struct page *page;
3208         unsigned long num_pages;
3209
3210         num_pages = num_extent_pages(eb->start, eb->len);
3211         clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3212
3213         clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3214                               GFP_NOFS);
3215         for (i = 0; i < num_pages; i++) {
3216                 page = extent_buffer_page(eb, i);
3217                 if (page)
3218                         ClearPageUptodate(page);
3219         }
3220         return 0;
3221 }
3222
3223 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3224                                 struct extent_buffer *eb)
3225 {
3226         unsigned long i;
3227         struct page *page;
3228         unsigned long num_pages;
3229
3230         num_pages = num_extent_pages(eb->start, eb->len);
3231
3232         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3233                             GFP_NOFS);
3234         for (i = 0; i < num_pages; i++) {
3235                 page = extent_buffer_page(eb, i);
3236                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3237                     ((i == num_pages - 1) &&
3238                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3239                         check_page_uptodate(tree, page);
3240                         continue;
3241                 }
3242                 SetPageUptodate(page);
3243         }
3244         return 0;
3245 }
3246
3247 int extent_range_uptodate(struct extent_io_tree *tree,
3248                           u64 start, u64 end)
3249 {
3250         struct page *page;
3251         int ret;
3252         int pg_uptodate = 1;
3253         int uptodate;
3254         unsigned long index;
3255
3256         ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3257         if (ret)
3258                 return 1;
3259         while (start <= end) {
3260                 index = start >> PAGE_CACHE_SHIFT;
3261                 page = find_get_page(tree->mapping, index);
3262                 uptodate = page && PageUptodate(page);
3263                 if (page)
                             page_cache_release(page);
3264                 if (!uptodate) {
3265                         pg_uptodate = 0;
3266                         break;
3267                 }
3268                 start += PAGE_CACHE_SIZE;
3269         }
3270         return pg_uptodate;
3271 }
3272
3273 int extent_buffer_uptodate(struct extent_io_tree *tree,
3274                            struct extent_buffer *eb)
3275 {
3276         int ret = 0;
3277         unsigned long num_pages;
3278         unsigned long i;
3279         struct page *page;
3280         int pg_uptodate = 1;
3281
3282         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3283                 return 1;
3284
3285         ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3286                            EXTENT_UPTODATE, 1, NULL);
3287         if (ret)
3288                 return ret;
3289
3290         num_pages = num_extent_pages(eb->start, eb->len);
3291         for (i = 0; i < num_pages; i++) {
3292                 page = extent_buffer_page(eb, i);
3293                 if (!PageUptodate(page)) {
3294                         pg_uptodate = 0;
3295                         break;
3296                 }
3297         }
3298         return pg_uptodate;
3299 }
3300
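/*
 * Read the pages backing an extent buffer from disk.  Pages that are
 * already uptodate are skipped; the rest are submitted in a single bio
 * when possible.  With wait == 0 the pages are only trylocked (giving up
 * entirely if any is already locked) and the reads are started without
 * waiting for completion; with wait != 0 we block until every page has
 * been read and return -EIO on failure.
 */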
3301 int read_extent_buffer_pages(struct extent_io_tree *tree,
3302                              struct extent_buffer *eb,
3303                              u64 start, int wait,
3304                              get_extent_t *get_extent, int mirror_num)
3305 {
3306         unsigned long i;
3307         unsigned long start_i;
3308         struct page *page;
3309         int err;
3310         int ret = 0;
3311         int locked_pages = 0;
3312         int all_uptodate = 1;
3313         int inc_all_pages = 0;
3314         unsigned long num_pages;
3315         struct bio *bio = NULL;
3316         unsigned long bio_flags = 0;
3317
3318         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3319                 return 0;
3320
3321         if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3322                            EXTENT_UPTODATE, 1, NULL)) {
3323                 return 0;
3324         }
3325
3326         if (start) {
3327                 WARN_ON(start < eb->start);
3328                 start_i = (start >> PAGE_CACHE_SHIFT) -
3329                         (eb->start >> PAGE_CACHE_SHIFT);
3330         } else {
3331                 start_i = 0;
3332         }
3333
3334         num_pages = num_extent_pages(eb->start, eb->len);
3335         for (i = start_i; i < num_pages; i++) {
3336                 page = extent_buffer_page(eb, i);
3337                 if (!wait) {
3338                         if (!trylock_page(page))
3339                                 goto unlock_exit;
3340                 } else {
3341                         lock_page(page);
3342                 }
3343                 locked_pages++;
3344                 if (!PageUptodate(page))
3345                         all_uptodate = 0;
3346         }
3347         if (all_uptodate) {
3348                 if (start_i == 0)
3349                         set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3350                 goto unlock_exit;
3351         }
3352
3353         for (i = start_i; i < num_pages; i++) {
3354                 page = extent_buffer_page(eb, i);
3355                 if (inc_all_pages)
3356                         page_cache_get(page);
3357                 if (!PageUptodate(page)) {
3358                         if (start_i == 0)
3359                                 inc_all_pages = 1;
3360                         ClearPageError(page);
3361                         err = __extent_read_full_page(tree, page,
3362                                                       get_extent, &bio,
3363                                                       mirror_num, &bio_flags);
3364                         if (err)
3365                                 ret = err;
3366                 } else {
3367                         unlock_page(page);
3368                 }
3369         }
3370
3371         if (bio)
3372                 submit_one_bio(READ, bio, mirror_num, bio_flags);
3373
3374         if (ret || !wait)
3375                 return ret;
3376
3377         for (i = start_i; i < num_pages; i++) {
3378                 page = extent_buffer_page(eb, i);
3379                 wait_on_page_locked(page);
3380                 if (!PageUptodate(page))
3381                         ret = -EIO;
3382         }
3383
3384         if (!ret)
3385                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3386         return ret;
3387
3388 unlock_exit:
3389         i = start_i;
3390         while (locked_pages > 0) {
3391                 page = extent_buffer_page(eb, i);
3392                 i++;
3393                 unlock_page(page);
3394                 locked_pages--;
3395         }
3396         return ret;
3397 }
3398
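/*
 * Copy len bytes at offset start in the extent buffer into dstv,
 * kmapping one backing page at a time.
 */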
3399 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3400                         unsigned long start,
3401                         unsigned long len)
3402 {
3403         size_t cur;
3404         size_t offset;
3405         struct page *page;
3406         char *kaddr;
3407         char *dst = (char *)dstv;
3408         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3409         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3410
3411         WARN_ON(start > eb->len);
3412         WARN_ON(start + len > eb->start + eb->len);
3413
3414         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3415
3416         while (len > 0) {
3417                 page = extent_buffer_page(eb, i);
3418
3419                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3420                 kaddr = kmap_atomic(page, KM_USER1);
3421                 memcpy(dst, kaddr + offset, cur);
3422                 kunmap_atomic(kaddr, KM_USER1);
3423
3424                 dst += cur;
3425                 len -= cur;
3426                 offset = 0;
3427                 i++;
3428         }
3429 }
3430
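/*
 * Map a chunk of the extent buffer for direct access.  The requested
 * range must not cross a page boundary; on success *map points at the
 * mapped bytes, *token holds the kmap cookie for unmap_extent_buffer,
 * and *map_start/*map_len describe the buffer range actually mapped.
 */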
3431 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3432                                unsigned long min_len, char **token, char **map,
3433                                unsigned long *map_start,
3434                                unsigned long *map_len, int km)
3435 {
3436         size_t offset = start & (PAGE_CACHE_SIZE - 1);
3437         char *kaddr;
3438         struct page *p;
3439         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3440         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3441         unsigned long end_i = (start_offset + start + min_len - 1) >>
3442                 PAGE_CACHE_SHIFT;
3443
3444         if (i != end_i)
3445                 return -EINVAL;
3446
3447         if (i == 0) {
3448                 offset = start_offset;
3449                 *map_start = 0;
3450         } else {
3451                 offset = 0;
3452                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3453         }
3454
3455         if (start + min_len > eb->len) {
3456                 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3457                        "wanted %lu %lu\n", (unsigned long long)eb->start,
3458                        eb->len, start, min_len);
3459                 WARN_ON(1);
3460         }
3461
3462         p = extent_buffer_page(eb, i);
3463         kaddr = kmap_atomic(p, km);
3464         *token = kaddr;
3465         *map = kaddr + offset;
3466         *map_len = PAGE_CACHE_SIZE - offset;
3467         return 0;
3468 }
3469
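/*
 * Like map_private_extent_buffer, but if the buffer already had a
 * long-lived mapping it is dropped first and replaced with the new one.
 */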
3470 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3471                       unsigned long min_len,
3472                       char **token, char **map,
3473                       unsigned long *map_start,
3474                       unsigned long *map_len, int km)
3475 {
3476         int err;
3477         int save = 0;
3478         if (eb->map_token) {
3479                 unmap_extent_buffer(eb, eb->map_token, km);
3480                 eb->map_token = NULL;
3481                 save = 1;
3482         }
3483         err = map_private_extent_buffer(eb, start, min_len, token, map,
3484                                        map_start, map_len, km);
3485         if (!err && save) {
3486                 eb->map_token = *token;
3487                 eb->kaddr = *map;
3488                 eb->map_start = *map_start;
3489                 eb->map_len = *map_len;
3490         }
3491         return err;
3492 }
3493
3494 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3495 {
3496         kunmap_atomic(token, km);
3497 }
3498
3499 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3500                           unsigned long start,
3501                           unsigned long len)
3502 {
3503         size_t cur;
3504         size_t offset;
3505         struct page *page;
3506         char *kaddr;
3507         char *ptr = (char *)ptrv;
3508         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3509         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3510         int ret = 0;
3511
3512         WARN_ON(start > eb->len);
3513         WARN_ON(start + len > eb->start + eb->len);
3514
3515         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3516
3517         while (len > 0) {
3518                 page = extent_buffer_page(eb, i);
3519
3520                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3521
3522                 kaddr = kmap_atomic(page, KM_USER0);
3523                 ret = memcmp(ptr, kaddr + offset, cur);
3524                 kunmap_atomic(kaddr, KM_USER0);
3525                 if (ret)
3526                         break;
3527
3528                 ptr += cur;
3529                 len -= cur;
3530                 offset = 0;
3531                 i++;
3532         }
3533         return ret;
3534 }
3535
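/*
 * Copy len bytes from srcv into the extent buffer at offset start, one
 * backing page at a time.  The pages are expected to be uptodate already.
 */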
3536 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3537                          unsigned long start, unsigned long len)
3538 {
3539         size_t cur;
3540         size_t offset;
3541         struct page *page;
3542         char *kaddr;
3543         char *src = (char *)srcv;
3544         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3545         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3546
3547         WARN_ON(start > eb->len);
3548         WARN_ON(start + len > eb->start + eb->len);
3549
3550         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3551
3552         while (len > 0) {
3553                 page = extent_buffer_page(eb, i);
3554                 WARN_ON(!PageUptodate(page));
3555
3556                 cur = min(len, PAGE_CACHE_SIZE - offset);
3557                 kaddr = kmap_atomic(page, KM_USER1);
3558                 memcpy(kaddr + offset, src, cur);
3559                 kunmap_atomic(kaddr, KM_USER1);
3560
3561                 src += cur;
3562                 len -= cur;
3563                 offset = 0;
3564                 i++;
3565         }
3566 }
3567
3568 void memset_extent_buffer(struct extent_buffer *eb, char c,
3569                           unsigned long start, unsigned long len)
3570 {
3571         size_t cur;
3572         size_t offset;
3573         struct page *page;
3574         char *kaddr;
3575         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3576         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3577
3578         WARN_ON(start > eb->len);
3579         WARN_ON(start + len > eb->start + eb->len);
3580
3581         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3582
3583         while (len > 0) {
3584                 page = extent_buffer_page(eb, i);
3585                 WARN_ON(!PageUptodate(page));
3586
3587                 cur = min(len, PAGE_CACHE_SIZE - offset);
3588                 kaddr = kmap_atomic(page, KM_USER0);
3589                 memset(kaddr + offset, c, cur);
3590                 kunmap_atomic(kaddr, KM_USER0);
3591
3592                 len -= cur;
3593                 offset = 0;
3594                 i++;
3595         }
3596 }
3597
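/*
 * Copy len bytes from src (at src_offset) into dst (at dst_offset).
 * The two buffers are expected to be the same length.
 */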
3598 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3599                         unsigned long dst_offset, unsigned long src_offset,
3600                         unsigned long len)
3601 {
3602         u64 dst_len = dst->len;
3603         size_t cur;
3604         size_t offset;
3605         struct page *page;
3606         char *kaddr;
3607         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3608         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3609
3610         WARN_ON(src->len != dst_len);
3611
3612         offset = (start_offset + dst_offset) &
3613                 ((unsigned long)PAGE_CACHE_SIZE - 1);
3614
3615         while (len > 0) {
3616                 page = extent_buffer_page(dst, i);
3617                 WARN_ON(!PageUptodate(page));
3618
3619                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3620
3621                 kaddr = kmap_atomic(page, KM_USER0);
3622                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3623                 kunmap_atomic(kaddr, KM_USER0);
3624
3625                 src_offset += cur;
3626                 len -= cur;
3627                 offset = 0;
3628                 i++;
3629         }
3630 }
3631
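/*
 * memmove-style helper for overlapping moves within the buffer.  Within
 * a single page a plain memmove is used; across two distinct pages the
 * bytes are copied backwards by hand, mirroring the back-to-front moves
 * done by memmove_extent_buffer.
 */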
3632 static void move_pages(struct page *dst_page, struct page *src_page,
3633                        unsigned long dst_off, unsigned long src_off,
3634                        unsigned long len)
3635 {
3636         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3637         if (dst_page == src_page) {
3638                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3639         } else {
3640                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3641                 char *p = dst_kaddr + dst_off + len;
3642                 char *s = src_kaddr + src_off + len;
3643
3644                 while (len--)
3645                         *--p = *--s;
3646
3647                 kunmap_atomic(src_kaddr, KM_USER1);
3648         }
3649         kunmap_atomic(dst_kaddr, KM_USER0);
3650 }
3651
3652 static void copy_pages(struct page *dst_page, struct page *src_page,
3653                        unsigned long dst_off, unsigned long src_off,
3654                        unsigned long len)
3655 {
3656         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3657         char *src_kaddr;
3658
3659         if (dst_page != src_page)
3660                 src_kaddr = kmap_atomic(src_page, KM_USER1);
3661         else
3662                 src_kaddr = dst_kaddr;
3663
3664         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3665         kunmap_atomic(dst_kaddr, KM_USER0);
3666         if (dst_page != src_page)
3667                 kunmap_atomic(src_kaddr, KM_USER1);
3668 }
3669
3670 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3671                            unsigned long src_offset, unsigned long len)
3672 {
3673         size_t cur;
3674         size_t dst_off_in_page;
3675         size_t src_off_in_page;
3676         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3677         unsigned long dst_i;
3678         unsigned long src_i;
3679
3680         if (src_offset + len > dst->len) {
3681                 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3682                        "len %lu dst len %lu\n", src_offset, len, dst->len);
3683                 BUG_ON(1);
3684         }
3685         if (dst_offset + len > dst->len) {
3686                 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3687                        "len %lu dst len %lu\n", dst_offset, len, dst->len);
3688                 BUG_ON(1);
3689         }
3690
3691         while (len > 0) {
3692                 dst_off_in_page = (start_offset + dst_offset) &
3693                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3694                 src_off_in_page = (start_offset + src_offset) &
3695                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3696
3697                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3698                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3699
3700                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3701                                                src_off_in_page));
3702                 cur = min_t(unsigned long, cur,
3703                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3704
3705                 copy_pages(extent_buffer_page(dst, dst_i),
3706                            extent_buffer_page(dst, src_i),
3707                            dst_off_in_page, src_off_in_page, cur);
3708
3709                 src_offset += cur;
3710                 dst_offset += cur;
3711                 len -= cur;
3712         }
3713 }
3714
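/*
 * memmove within an extent buffer.  Moves towards lower offsets cannot
 * overlap destructively and are handed to memcpy_extent_buffer; moves
 * towards higher offsets are done back to front, page by page, via
 * move_pages.
 */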
3715 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3716                            unsigned long src_offset, unsigned long len)
3717 {
3718         size_t cur;
3719         size_t dst_off_in_page;
3720         size_t src_off_in_page;
3721         unsigned long dst_end = dst_offset + len - 1;
3722         unsigned long src_end = src_offset + len - 1;
3723         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3724         unsigned long dst_i;
3725         unsigned long src_i;
3726
3727         if (src_offset + len > dst->len) {
3728                 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3729                        "len %lu dst len %lu\n", src_offset, len, dst->len);
3730                 BUG_ON(1);
3731         }
3732         if (dst_offset + len > dst->len) {
3733                 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3734                        "len %lu dst len %lu\n", dst_offset, len, dst->len);
3735                 BUG_ON(1);
3736         }
3737         if (dst_offset < src_offset) {
3738                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3739                 return;
3740         }
3741         while (len > 0) {
3742                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3743                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3744
3745                 dst_off_in_page = (start_offset + dst_end) &
3746                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3747                 src_off_in_page = (start_offset + src_end) &
3748                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3749
3750                 cur = min_t(unsigned long, len, src_off_in_page + 1);
3751                 cur = min(cur, dst_off_in_page + 1);
3752                 move_pages(extent_buffer_page(dst, dst_i),
3753                            extent_buffer_page(dst, src_i),
3754                            dst_off_in_page - cur + 1,
3755                            src_off_in_page - cur + 1, cur);
3756
3757                 dst_end -= cur;
3758                 src_end -= cur;
3759                 len -= cur;
3760         }
3761 }
3762
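/*
 * releasepage helper for extent buffer pages: if the buffer covering this
 * page is unreferenced and clean, drop its page references and free it so
 * the page can go away.  Returns 1 if the buffer was released (or did not
 * exist), 0 if it is still in use.
 */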
3763 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3764 {
3765         u64 start = page_offset(page);
3766         struct extent_buffer *eb;
3767         int ret = 1;
3768         unsigned long i;
3769         unsigned long num_pages;
3770
3771         spin_lock(&tree->buffer_lock);
3772         eb = buffer_search(tree, start);
3773         if (!eb)
3774                 goto out;
3775
3776         if (atomic_read(&eb->refs) > 1) {
3777                 ret = 0;
3778                 goto out;
3779         }
3780         if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3781                 ret = 0;
3782                 goto out;
3783         }
3784         /* at this point we can safely release the extent buffer */
3785         num_pages = num_extent_pages(eb->start, eb->len);
3786         for (i = 0; i < num_pages; i++)
3787                 page_cache_release(extent_buffer_page(eb, i));
3788         rb_erase(&eb->rb_node, &tree->buffer);
3789         __free_extent_buffer(eb);
3790 out:
3791         spin_unlock(&tree->buffer_lock);
3792         return ret;
3793 }