Btrfs: Fix failure cleanups when allocating extent buffers fail
fs/btrfs/extent_map.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include "extent_map.h"
15
16 /* temporary define until extent_map moves out of btrfs */
17 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
18                                        unsigned long extra_flags,
19                                        void (*ctor)(void *, struct kmem_cache *,
20                                                     unsigned long));
21
22 static struct kmem_cache *extent_map_cache;
23 static struct kmem_cache *extent_state_cache;
24 static struct kmem_cache *extent_buffer_cache;
25
26 static LIST_HEAD(buffers);
27 static LIST_HEAD(states);
28
29 static DEFINE_SPINLOCK(state_lock);
30 #define BUFFER_LRU_MAX 64
31
32 struct tree_entry {
33         u64 start;
34         u64 end;
35         int in_tree;
36         struct rb_node rb_node;
37 };
38
39 struct extent_page_data {
40         struct bio *bio;
41         struct extent_map_tree *tree;
42         get_extent_t *get_extent;
43 };
44
45 void __init extent_map_init(void)
46 {
47         extent_map_cache = btrfs_cache_create("extent_map",
48                                             sizeof(struct extent_map), 0,
49                                             NULL);
50         extent_state_cache = btrfs_cache_create("extent_state",
51                                             sizeof(struct extent_state), 0,
52                                             NULL);
53         extent_buffer_cache = btrfs_cache_create("extent_buffers",
54                                             sizeof(struct extent_buffer), 0,
55                                             NULL);
56 }
57
58 void __exit extent_map_exit(void)
59 {
60         struct extent_state *state;
61
62         while (!list_empty(&states)) {
63                 state = list_entry(states.next, struct extent_state, list);
64                 printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
65                 list_del(&state->list);
66                 kmem_cache_free(extent_state_cache, state);
67
68         }
69
70         if (extent_map_cache)
71                 kmem_cache_destroy(extent_map_cache);
72         if (extent_state_cache)
73                 kmem_cache_destroy(extent_state_cache);
74         if (extent_buffer_cache)
75                 kmem_cache_destroy(extent_buffer_cache);
76 }
77
78 void extent_map_tree_init(struct extent_map_tree *tree,
79                           struct address_space *mapping, gfp_t mask)
80 {
81         tree->map.rb_node = NULL;
82         tree->state.rb_node = NULL;
83         tree->ops = NULL;
84         rwlock_init(&tree->lock);
85         spin_lock_init(&tree->lru_lock);
86         tree->mapping = mapping;
87         INIT_LIST_HEAD(&tree->buffer_lru);
88         tree->lru_size = 0;
89 }
90 EXPORT_SYMBOL(extent_map_tree_init);
91
92 void extent_map_tree_empty_lru(struct extent_map_tree *tree)
93 {
94         struct extent_buffer *eb;
95         while(!list_empty(&tree->buffer_lru)) {
96                 eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
97                                 lru);
98                 list_del(&eb->lru);
99                 free_extent_buffer(eb);
100         }
101 }
102 EXPORT_SYMBOL(extent_map_tree_empty_lru);
103
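/*
 * allocate a new extent_map from the slab cache.  The map starts off-tree
 * with a single reference held by the caller; NULL is returned if the
 * allocation fails.
 */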
104 struct extent_map *alloc_extent_map(gfp_t mask)
105 {
106         struct extent_map *em;
107         em = kmem_cache_alloc(extent_map_cache, mask);
108         if (!em || IS_ERR(em))
109                 return em;
110         em->in_tree = 0;
111         atomic_set(&em->refs, 1);
112         return em;
113 }
114 EXPORT_SYMBOL(alloc_extent_map);
115
116 void free_extent_map(struct extent_map *em)
117 {
118         if (!em)
119                 return;
120         if (atomic_dec_and_test(&em->refs)) {
121                 WARN_ON(em->in_tree);
122                 kmem_cache_free(extent_map_cache, em);
123         }
124 }
125 EXPORT_SYMBOL(free_extent_map);
126
127
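/*
 * allocate a new extent_state.  The state starts with one reference and no
 * bits set, and is linked onto the global 'states' list so extent_map_exit()
 * can report leaked states at unload time.
 */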
128 struct extent_state *alloc_extent_state(gfp_t mask)
129 {
130         struct extent_state *state;
131         unsigned long flags;
132
133         state = kmem_cache_alloc(extent_state_cache, mask);
134         if (!state || IS_ERR(state))
135                 return state;
136         state->state = 0;
137         state->in_tree = 0;
138         state->private = 0;
139
140         spin_lock_irqsave(&state_lock, flags);
141         list_add(&state->list, &states);
142         spin_unlock_irqrestore(&state_lock, flags);
143
144         atomic_set(&state->refs, 1);
145         init_waitqueue_head(&state->wq);
146         return state;
147 }
148 EXPORT_SYMBOL(alloc_extent_state);
149
150 void free_extent_state(struct extent_state *state)
151 {
152         unsigned long flags;
153         if (!state)
154                 return;
155         if (atomic_dec_and_test(&state->refs)) {
156                 WARN_ON(state->in_tree);
157                 spin_lock_irqsave(&state_lock, flags);
158                 list_del(&state->list);
159                 spin_unlock_irqrestore(&state_lock, flags);
160                 kmem_cache_free(extent_state_cache, state);
161         }
162 }
163 EXPORT_SYMBOL(free_extent_state);
164
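/*
 * insert 'node' into the rb tree keyed by 'offset' (callers pass the end
 * offset of the range).  If an existing entry already covers 'offset',
 * nothing is inserted and that entry's rb_node is returned; NULL is
 * returned on a successful insert.
 */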
165 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
166                                    struct rb_node *node)
167 {
168         struct rb_node ** p = &root->rb_node;
169         struct rb_node * parent = NULL;
170         struct tree_entry *entry;
171
172         while(*p) {
173                 parent = *p;
174                 entry = rb_entry(parent, struct tree_entry, rb_node);
175
176                 if (offset < entry->start)
177                         p = &(*p)->rb_left;
178                 else if (offset > entry->end)
179                         p = &(*p)->rb_right;
180                 else
181                         return parent;
182         }
183
184         entry = rb_entry(node, struct tree_entry, rb_node);
185         entry->in_tree = 1;
186         rb_link_node(node, parent, p);
187         rb_insert_color(node, root);
188         return NULL;
189 }
190
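/*
 * search the rb tree for the entry containing 'offset'.  If none is found
 * and 'prev_ret' is non-NULL, *prev_ret is set to the first entry that ends
 * at or after 'offset' (or NULL if no such entry exists).
 */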
191 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
192                                    struct rb_node **prev_ret)
193 {
194         struct rb_node * n = root->rb_node;
195         struct rb_node *prev = NULL;
196         struct tree_entry *entry;
197         struct tree_entry *prev_entry = NULL;
198
199         while(n) {
200                 entry = rb_entry(n, struct tree_entry, rb_node);
201                 prev = n;
202                 prev_entry = entry;
203
204                 if (offset < entry->start)
205                         n = n->rb_left;
206                 else if (offset > entry->end)
207                         n = n->rb_right;
208                 else
209                         return n;
210         }
211         if (!prev_ret)
212                 return NULL;
213         while(prev && offset > prev_entry->end) {
214                 prev = rb_next(prev);
215                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
216         }
217         *prev_ret = prev;
218         return NULL;
219 }
220
221 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
222 {
223         struct rb_node *prev;
224         struct rb_node *ret;
225         ret = __tree_search(root, offset, &prev);
226         if (!ret)
227                 return prev;
228         return ret;
229 }
230
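/*
 * remove the entry covering 'offset' from the tree, clearing its in_tree
 * flag.  Returns -ENOENT if no entry contains that offset.
 */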
231 static int tree_delete(struct rb_root *root, u64 offset)
232 {
233         struct rb_node *node;
234         struct tree_entry *entry;
235
236         node = __tree_search(root, offset, NULL);
237         if (!node)
238                 return -ENOENT;
239         entry = rb_entry(node, struct tree_entry, rb_node);
240         entry->in_tree = 0;
241         rb_erase(node, root);
242         return 0;
243 }
244
245 /*
246  * add_extent_mapping tries a simple backward merge with existing
247  * mappings.  The extent_map struct passed in will be inserted into
248  * the tree directly (no copies made, just a reference taken).
249  */
250 int add_extent_mapping(struct extent_map_tree *tree,
251                        struct extent_map *em)
252 {
253         int ret = 0;
254         struct extent_map *prev = NULL;
255         struct rb_node *rb;
256
257         write_lock_irq(&tree->lock);
258         rb = tree_insert(&tree->map, em->end, &em->rb_node);
259         if (rb) {
260                 prev = rb_entry(rb, struct extent_map, rb_node);
261                 printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", prev->start, prev->end, em->start, em->end);
262                 ret = -EEXIST;
263                 goto out;
264         }
265         atomic_inc(&em->refs);
266         if (em->start != 0) {
267                 rb = rb_prev(&em->rb_node);
268                 if (rb)
269                         prev = rb_entry(rb, struct extent_map, rb_node);
270                 if (prev && prev->end + 1 == em->start &&
271                     ((em->block_start == EXTENT_MAP_HOLE &&
272                       prev->block_start == EXTENT_MAP_HOLE) ||
273                      (em->block_start == EXTENT_MAP_INLINE &&
274                       prev->block_start == EXTENT_MAP_INLINE) ||
275                      (em->block_start == EXTENT_MAP_DELALLOC &&
276                       prev->block_start == EXTENT_MAP_DELALLOC) ||
277                      (em->block_start < EXTENT_MAP_DELALLOC - 1 &&
278                       em->block_start == prev->block_end + 1))) {
279                         em->start = prev->start;
280                         em->block_start = prev->block_start;
281                         rb_erase(&prev->rb_node, &tree->map);
282                         prev->in_tree = 0;
283                         free_extent_map(prev);
284                 }
285          }
286 out:
287         write_unlock_irq(&tree->lock);
288         return ret;
289 }
290 EXPORT_SYMBOL(add_extent_mapping);
291
292 /*
293  * lookup_extent_mapping returns the first extent_map struct in the
294  * tree that intersects the [start, end] (inclusive) range.  There may
295  * be additional objects in the tree that intersect, so check the object
296  * returned carefully to make sure you don't need additional lookups.
297  */
298 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
299                                          u64 start, u64 end)
300 {
301         struct extent_map *em;
302         struct rb_node *rb_node;
303
304         read_lock_irq(&tree->lock);
305         rb_node = tree_search(&tree->map, start);
306         if (!rb_node) {
307                 em = NULL;
308                 goto out;
309         }
310         if (IS_ERR(rb_node)) {
311                 em = ERR_PTR(PTR_ERR(rb_node));
312                 goto out;
313         }
314         em = rb_entry(rb_node, struct extent_map, rb_node);
315         if (em->end < start || em->start > end) {
316                 em = NULL;
317                 goto out;
318         }
319         atomic_inc(&em->refs);
320 out:
321         read_unlock_irq(&tree->lock);
322         return em;
323 }
324 EXPORT_SYMBOL(lookup_extent_mapping);
325
326 /*
327  * removes an extent_map struct from the tree.  No reference counts are
328  * dropped, and no checks are done to see if the range is in use
329  */
330 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
331 {
332         int ret;
333
334         write_lock_irq(&tree->lock);
335         ret = tree_delete(&tree->map, em->end);
336         write_unlock_irq(&tree->lock);
337         return ret;
338 }
339 EXPORT_SYMBOL(remove_extent_mapping);
340
341 /*
342  * utility function to look for merge candidates inside a given range.
343  * Any extents with matching state are merged together into a single
344  * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
345  * are not merged because the end_io handlers need to be able to do
346  * operations on them without sleeping (or doing allocations/splits).
347  *
348  * This should be called with the tree lock held.
349  */
350 static int merge_state(struct extent_map_tree *tree,
351                        struct extent_state *state)
352 {
353         struct extent_state *other;
354         struct rb_node *other_node;
355
356         if (state->state & EXTENT_IOBITS)
357                 return 0;
358
359         other_node = rb_prev(&state->rb_node);
360         if (other_node) {
361                 other = rb_entry(other_node, struct extent_state, rb_node);
362                 if (other->end == state->start - 1 &&
363                     other->state == state->state) {
364                         state->start = other->start;
365                         other->in_tree = 0;
366                         rb_erase(&other->rb_node, &tree->state);
367                         free_extent_state(other);
368                 }
369         }
370         other_node = rb_next(&state->rb_node);
371         if (other_node) {
372                 other = rb_entry(other_node, struct extent_state, rb_node);
373                 if (other->start == state->end + 1 &&
374                     other->state == state->state) {
375                         other->start = state->start;
376                         state->in_tree = 0;
377                         rb_erase(&state->rb_node, &tree->state);
378                         free_extent_state(state);
379                 }
380         }
381         return 0;
382 }
383
384 /*
385  * insert an extent_state struct into the tree.  'bits' are set on the
386  * struct before it is inserted.
387  *
388  * This may return -EEXIST if the extent is already there, in which case the
389  * state struct is freed.
390  *
391  * The tree lock is not taken internally.  This is a utility function and
392  * probably isn't what you want to call (see set/clear_extent_bit).
393  */
394 static int insert_state(struct extent_map_tree *tree,
395                         struct extent_state *state, u64 start, u64 end,
396                         int bits)
397 {
398         struct rb_node *node;
399
400         if (end < start) {
401                 printk("end < start %Lu %Lu\n", end, start);
402                 WARN_ON(1);
403         }
404         state->state |= bits;
405         state->start = start;
406         state->end = end;
407         node = tree_insert(&tree->state, end, &state->rb_node);
408         if (node) {
409                 struct extent_state *found;
410                 found = rb_entry(node, struct extent_state, rb_node);
411                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
412                 free_extent_state(state);
413                 return -EEXIST;
414         }
415         merge_state(tree, state);
416         return 0;
417 }
418
419 /*
420  * split a given extent state struct in two, inserting the preallocated
421  * struct 'prealloc' as the newly created second half.  'split' indicates an
422  * offset inside 'orig' where it should be split.
423  *
424  * Before calling,
425  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
426  * are two extent state structs in the tree:
427  * prealloc: [orig->start, split - 1]
428  * orig: [ split, orig->end ]
429  *
430  * The tree locks are not taken by this function. They need to be held
431  * by the caller.
432  */
433 static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
434                        struct extent_state *prealloc, u64 split)
435 {
436         struct rb_node *node;
437         prealloc->start = orig->start;
438         prealloc->end = split - 1;
439         prealloc->state = orig->state;
440         orig->start = split;
441
442         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
443         if (node) {
444                 struct extent_state *found;
445                 found = rb_entry(node, struct extent_state, rb_node);
446                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
447                 free_extent_state(prealloc);
448                 return -EEXIST;
449         }
450         return 0;
451 }
452
453 /*
454  * utility function to clear some bits in an extent state struct.
455  * it will optionally wake up anyone waiting on this state (wake == 1), or
456  * forcibly remove the state from the tree (delete == 1).
457  *
458  * If no bits are set on the state struct after clearing things, the
459  * struct is freed and removed from the tree
460  */
461 static int clear_state_bit(struct extent_map_tree *tree,
462                             struct extent_state *state, int bits, int wake,
463                             int delete)
464 {
465         int ret = state->state & bits;
466         state->state &= ~bits;
467         if (wake)
468                 wake_up(&state->wq);
469         if (delete || state->state == 0) {
470                 if (state->in_tree) {
471                         rb_erase(&state->rb_node, &tree->state);
472                         state->in_tree = 0;
473                         free_extent_state(state);
474                 } else {
475                         WARN_ON(1);
476                 }
477         } else {
478                 merge_state(tree, state);
479         }
480         return ret;
481 }
482
483 /*
484  * clear some bits on a range in the tree.  This may require splitting
485  * or inserting elements in the tree, so the gfp mask is used to
486  * indicate which allocations or sleeping are allowed.
487  *
488  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
489  * the given range from the tree regardless of state (ie for truncate).
490  *
491  * the range [start, end] is inclusive.
492  *
493  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
494  * bits were already set, or zero if none of the bits were already set.
495  */
496 int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
497                      int bits, int wake, int delete, gfp_t mask)
498 {
499         struct extent_state *state;
500         struct extent_state *prealloc = NULL;
501         struct rb_node *node;
502         unsigned long flags;
503         int err;
504         int set = 0;
505
506 again:
507         if (!prealloc && (mask & __GFP_WAIT)) {
508                 prealloc = alloc_extent_state(mask);
509                 if (!prealloc)
510                         return -ENOMEM;
511         }
512
513         write_lock_irqsave(&tree->lock, flags);
514         /*
515          * this search will find the extents that end after
516          * our range starts
517          */
518         node = tree_search(&tree->state, start);
519         if (!node)
520                 goto out;
521         state = rb_entry(node, struct extent_state, rb_node);
522         if (state->start > end)
523                 goto out;
524         WARN_ON(state->end < start);
525
526         /*
527          *     | ---- desired range ---- |
528          *  | state | or
529          *  | ------------- state -------------- |
530          *
531          * We need to split the extent we found, and may flip
532          * bits on second half.
533          *
534          * If the extent we found extends past our range, we
535          * just split and search again.  It'll get split again
536          * the next time though.
537          *
538          * If the extent we found is inside our range, we clear
539          * the desired bit on it.
540          */
541
542         if (state->start < start) {
543                 err = split_state(tree, state, prealloc, start);
544                 BUG_ON(err == -EEXIST);
545                 prealloc = NULL;
546                 if (err)
547                         goto out;
548                 if (state->end <= end) {
549                         start = state->end + 1;
550                         set |= clear_state_bit(tree, state, bits,
551                                         wake, delete);
552                 } else {
553                         start = state->start;
554                 }
555                 goto search_again;
556         }
557         /*
558          * | ---- desired range ---- |
559          *                        | state |
560          * We need to split the extent, and clear the bit
561          * on the first half
562          */
563         if (state->start <= end && state->end > end) {
564                 err = split_state(tree, state, prealloc, end + 1);
565                 BUG_ON(err == -EEXIST);
566
567                 if (wake)
568                         wake_up(&state->wq);
569                 set |= clear_state_bit(tree, prealloc, bits,
570                                        wake, delete);
571                 prealloc = NULL;
572                 goto out;
573         }
574
575         start = state->end + 1;
576         set |= clear_state_bit(tree, state, bits, wake, delete);
577         goto search_again;
578
579 out:
580         write_unlock_irqrestore(&tree->lock, flags);
581         if (prealloc)
582                 free_extent_state(prealloc);
583
584         return set;
585
586 search_again:
587         if (start > end)
588                 goto out;
589         write_unlock_irqrestore(&tree->lock, flags);
590         if (mask & __GFP_WAIT)
591                 cond_resched();
592         goto again;
593 }
594 EXPORT_SYMBOL(clear_extent_bit);
595
596 static int wait_on_state(struct extent_map_tree *tree,
597                          struct extent_state *state)
598 {
599         DEFINE_WAIT(wait);
600         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
601         read_unlock_irq(&tree->lock);
602         schedule();
603         read_lock_irq(&tree->lock);
604         finish_wait(&state->wq, &wait);
605         return 0;
606 }
607
608 /*
609  * waits for one or more bits to clear on a range in the state tree.
610  * The range [start, end] is inclusive.
611  * The tree lock is taken by this function
612  */
613 int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
614 {
615         struct extent_state *state;
616         struct rb_node *node;
617
618         read_lock_irq(&tree->lock);
619 again:
620         while (1) {
621                 /*
622                  * this search will find all the extents that end after
623                  * our range starts
624                  */
625                 node = tree_search(&tree->state, start);
626                 if (!node)
627                         break;
628
629                 state = rb_entry(node, struct extent_state, rb_node);
630
631                 if (state->start > end)
632                         goto out;
633
634                 if (state->state & bits) {
635                         start = state->start;
636                         atomic_inc(&state->refs);
637                         wait_on_state(tree, state);
638                         free_extent_state(state);
639                         goto again;
640                 }
641                 start = state->end + 1;
642
643                 if (start > end)
644                         break;
645
646                 if (need_resched()) {
647                         read_unlock_irq(&tree->lock);
648                         cond_resched();
649                         read_lock_irq(&tree->lock);
650                 }
651         }
652 out:
653         read_unlock_irq(&tree->lock);
654         return 0;
655 }
656 EXPORT_SYMBOL(wait_extent_bit);
657
658 /*
659  * set some bits on a range in the tree.  This may require allocations
660  * or sleeping, so the gfp mask is used to indicate what is allowed.
661  *
662  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
663  * range already has the desired bits set.  The start of the existing
664  * range is returned in failed_start in this case.
665  *
666  * [start, end] is inclusive
667  * This takes the tree lock.
668  */
669 int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
670                    int exclusive, u64 *failed_start, gfp_t mask)
671 {
672         struct extent_state *state;
673         struct extent_state *prealloc = NULL;
674         struct rb_node *node;
675         unsigned long flags;
676         int err = 0;
677         int set;
678         u64 last_start;
679         u64 last_end;
680 again:
681         if (!prealloc && (mask & __GFP_WAIT)) {
682                 prealloc = alloc_extent_state(mask);
683                 if (!prealloc)
684                         return -ENOMEM;
685         }
686
687         write_lock_irqsave(&tree->lock, flags);
688         /*
689          * this search will find all the extents that end after
690          * our range starts.
691          */
692         node = tree_search(&tree->state, start);
693         if (!node) {
694                 err = insert_state(tree, prealloc, start, end, bits);
695                 prealloc = NULL;
696                 BUG_ON(err == -EEXIST);
697                 goto out;
698         }
699
700         state = rb_entry(node, struct extent_state, rb_node);
701         last_start = state->start;
702         last_end = state->end;
703
704         /*
705          * | ---- desired range ---- |
706          * | state |
707          *
708          * Just lock what we found and keep going
709          */
710         if (state->start == start && state->end <= end) {
711                 set = state->state & bits;
712                 if (set && exclusive) {
713                         *failed_start = state->start;
714                         err = -EEXIST;
715                         goto out;
716                 }
717                 state->state |= bits;
718                 start = state->end + 1;
719                 merge_state(tree, state);
720                 goto search_again;
721         }
722
723         /*
724          *     | ---- desired range ---- |
725          * | state |
726          *   or
727          * | ------------- state -------------- |
728          *
729          * We need to split the extent we found, and may flip bits on
730          * second half.
731          *
732          * If the extent we found extends past our
733          * range, we just split and search again.  It'll get split
734          * again the next time though.
735          *
736          * If the extent we found is inside our range, we set the
737          * desired bit on it.
738          */
739         if (state->start < start) {
740                 set = state->state & bits;
741                 if (exclusive && set) {
742                         *failed_start = start;
743                         err = -EEXIST;
744                         goto out;
745                 }
746                 err = split_state(tree, state, prealloc, start);
747                 BUG_ON(err == -EEXIST);
748                 prealloc = NULL;
749                 if (err)
750                         goto out;
751                 if (state->end <= end) {
752                         state->state |= bits;
753                         start = state->end + 1;
754                         merge_state(tree, state);
755                 } else {
756                         start = state->start;
757                 }
758                 goto search_again;
759         }
760         /*
761          * | ---- desired range ---- |
762          *     | state | or               | state |
763          *
764          * There's a hole, we need to insert something in it and
765          * ignore the extent we found.
766          */
767         if (state->start > start) {
768                 u64 this_end;
769                 if (end < last_start)
770                         this_end = end;
771                 else
772                         this_end = last_start - 1;
773                 err = insert_state(tree, prealloc, start, this_end,
774                                    bits);
775                 prealloc = NULL;
776                 BUG_ON(err == -EEXIST);
777                 if (err)
778                         goto out;
779                 start = this_end + 1;
780                 goto search_again;
781         }
782         /*
783          * | ---- desired range ---- |
784          *                        | state |
785          * We need to split the extent, and set the bit
786          * on the first half
787          */
788         if (state->start <= end && state->end > end) {
789                 set = state->state & bits;
790                 if (exclusive && set) {
791                         *failed_start = start;
792                         err = -EEXIST;
793                         goto out;
794                 }
795                 err = split_state(tree, state, prealloc, end + 1);
796                 BUG_ON(err == -EEXIST);
797
798                 prealloc->state |= bits;
799                 merge_state(tree, prealloc);
800                 prealloc = NULL;
801                 goto out;
802         }
803
804         goto search_again;
805
806 out:
807         write_unlock_irqrestore(&tree->lock, flags);
808         if (prealloc)
809                 free_extent_state(prealloc);
810
811         return err;
812
813 search_again:
814         if (start > end)
815                 goto out;
816         write_unlock_irqrestore(&tree->lock, flags);
817         if (mask & __GFP_WAIT)
818                 cond_resched();
819         goto again;
820 }
821 EXPORT_SYMBOL(set_extent_bit);
822
823 /* wrappers around set/clear extent bit */
824 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
825                      gfp_t mask)
826 {
827         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
828                               mask);
829 }
830 EXPORT_SYMBOL(set_extent_dirty);
831
832 int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
833                     int bits, gfp_t mask)
834 {
835         return set_extent_bit(tree, start, end, bits, 0, NULL,
836                               mask);
837 }
838 EXPORT_SYMBOL(set_extent_bits);
839
840 int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
841                       int bits, gfp_t mask)
842 {
843         return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
844 }
845 EXPORT_SYMBOL(clear_extent_bits);
846
847 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
848                      gfp_t mask)
849 {
850         return set_extent_bit(tree, start, end,
851                               EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
852                               mask);
853 }
854 EXPORT_SYMBOL(set_extent_delalloc);
855
856 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
857                        gfp_t mask)
858 {
859         return clear_extent_bit(tree, start, end,
860                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
861 }
862 EXPORT_SYMBOL(clear_extent_dirty);
863
864 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
865                      gfp_t mask)
866 {
867         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
868                               mask);
869 }
870 EXPORT_SYMBOL(set_extent_new);
871
872 int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
873                        gfp_t mask)
874 {
875         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
876 }
877 EXPORT_SYMBOL(clear_extent_new);
878
879 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
880                         gfp_t mask)
881 {
882         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
883                               mask);
884 }
885 EXPORT_SYMBOL(set_extent_uptodate);
886
887 int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
888                           gfp_t mask)
889 {
890         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
891 }
892 EXPORT_SYMBOL(clear_extent_uptodate);
893
894 int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
895                          gfp_t mask)
896 {
897         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
898                               0, NULL, mask);
899 }
900 EXPORT_SYMBOL(set_extent_writeback);
901
902 int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
903                            gfp_t mask)
904 {
905         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
906 }
907 EXPORT_SYMBOL(clear_extent_writeback);
908
909 int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
910 {
911         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
912 }
913 EXPORT_SYMBOL(wait_on_extent_writeback);
914
915 /*
916  * locks a range in ascending order, waiting for any locked regions
917  * it hits on the way.  [start,end] are inclusive, and this will sleep.
918  */
919 int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
920 {
921         int err;
922         u64 failed_start;
923         while (1) {
924                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
925                                      &failed_start, mask);
926                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
927                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
928                         start = failed_start;
929                 } else {
930                         break;
931                 }
932                 WARN_ON(start > end);
933         }
934         return err;
935 }
936 EXPORT_SYMBOL(lock_extent);
937
938 int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
939                   gfp_t mask)
940 {
941         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
942 }
943 EXPORT_SYMBOL(unlock_extent);
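/*
 * Typical usage of the two helpers above (illustrative sketch only, not
 * taken from a caller in this file): pin a byte range before doing IO on
 * it and drop the lock once the IO has completed:
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... issue and wait for IO on [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 */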
944
945 /*
946  * helper function to set pages and extents in the tree dirty
947  */
948 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
949 {
950         unsigned long index = start >> PAGE_CACHE_SHIFT;
951         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
952         struct page *page;
953
954         while (index <= end_index) {
955                 page = find_get_page(tree->mapping, index);
956                 BUG_ON(!page);
957                 __set_page_dirty_nobuffers(page);
958                 page_cache_release(page);
959                 index++;
960         }
961         set_extent_dirty(tree, start, end, GFP_NOFS);
962         return 0;
963 }
964 EXPORT_SYMBOL(set_range_dirty);
965
966 /*
967  * helper function to set both pages and extents in the tree writeback
968  */
969 int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
970 {
971         unsigned long index = start >> PAGE_CACHE_SHIFT;
972         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
973         struct page *page;
974
975         while (index <= end_index) {
976                 page = find_get_page(tree->mapping, index);
977                 BUG_ON(!page);
978                 set_page_writeback(page);
979                 page_cache_release(page);
980                 index++;
981         }
982         set_extent_writeback(tree, start, end, GFP_NOFS);
983         return 0;
984 }
985 EXPORT_SYMBOL(set_range_writeback);
986
987 int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
988                           u64 *start_ret, u64 *end_ret, int bits)
989 {
990         struct rb_node *node;
991         struct extent_state *state;
992         int ret = 1;
993
994         read_lock_irq(&tree->lock);
995         /*
996          * this search will find all the extents that end after
997          * our range starts.
998          */
999         node = tree_search(&tree->state, start);
1000         if (!node || IS_ERR(node)) {
1001                 goto out;
1002         }
1003
1004         while(1) {
1005                 state = rb_entry(node, struct extent_state, rb_node);
1006                 if (state->end >= start && (state->state & bits)) {
1007                         *start_ret = state->start;
1008                         *end_ret = state->end;
1009                         ret = 0;
1010                         break;
1011                 }
1012                 node = rb_next(node);
1013                 if (!node)
1014                         break;
1015         }
1016 out:
1017         read_unlock_irq(&tree->lock);
1018         return ret;
1019 }
1020 EXPORT_SYMBOL(find_first_extent_bit);
1021
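/*
 * walk forward from 'start' over contiguous EXTENT_DELALLOC extents,
 * locking every extent at or past 'lock_start' (sleeping if one is already
 * locked).  *end is set to the end of the last extent found, the walk stops
 * once roughly 'max_bytes' worth of extents have been accumulated, and the
 * number of extents found is returned.
 */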
1022 u64 find_lock_delalloc_range(struct extent_map_tree *tree,
1023                              u64 start, u64 lock_start, u64 *end, u64 max_bytes)
1024 {
1025         struct rb_node *node;
1026         struct extent_state *state;
1027         u64 cur_start = start;
1028         u64 found = 0;
1029         u64 total_bytes = 0;
1030
1031         write_lock_irq(&tree->lock);
1032         /*
1033          * this search will find all the extents that end after
1034          * our range starts.
1035          */
1036 search_again:
1037         node = tree_search(&tree->state, cur_start);
1038         if (!node || IS_ERR(node)) {
1039                 goto out;
1040         }
1041
1042         while(1) {
1043                 state = rb_entry(node, struct extent_state, rb_node);
1044                 if (state->start != cur_start) {
1045                         goto out;
1046                 }
1047                 if (!(state->state & EXTENT_DELALLOC)) {
1048                         goto out;
1049                 }
1050                 if (state->start >= lock_start) {
1051                         if (state->state & EXTENT_LOCKED) {
1052                                 DEFINE_WAIT(wait);
1053                                 atomic_inc(&state->refs);
1054                                 prepare_to_wait(&state->wq, &wait,
1055                                                 TASK_UNINTERRUPTIBLE);
1056                                 write_unlock_irq(&tree->lock);
1057                                 schedule();
1058                                 write_lock_irq(&tree->lock);
1059                                 finish_wait(&state->wq, &wait);
1060                                 free_extent_state(state);
1061                                 goto search_again;
1062                         }
1063                         state->state |= EXTENT_LOCKED;
1064                 }
1065                 found++;
1066                 *end = state->end;
1067                 cur_start = state->end + 1;
1068                 node = rb_next(node);
1069                 if (!node)
1070                         break;
1071                 total_bytes += state->end - state->start + 1;
1072                 if (total_bytes >= max_bytes)
1073                         break;
1074         }
1075 out:
1076         write_unlock_irq(&tree->lock);
1077         return found;
1078 }
1079
1080 /*
1081  * helper function to lock both pages and extents in the tree.
1082  * pages must be locked first.
1083  */
1084 int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
1085 {
1086         unsigned long index = start >> PAGE_CACHE_SHIFT;
1087         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1088         struct page *page;
1089         int err;
1090
1091         while (index <= end_index) {
1092                 page = grab_cache_page(tree->mapping, index);
1093                 if (!page) {
1094                         err = -ENOMEM;
1095                         goto failed;
1096                 }
1097                 if (IS_ERR(page)) {
1098                         err = PTR_ERR(page);
1099                         goto failed;
1100                 }
1101                 index++;
1102         }
1103         lock_extent(tree, start, end, GFP_NOFS);
1104         return 0;
1105
1106 failed:
1107         /*
1108          * we failed above in getting the page at 'index', so we undo here
1109          * up to but not including the page at 'index'
1110          */
1111         end_index = index;
1112         index = start >> PAGE_CACHE_SHIFT;
1113         while (index < end_index) {
1114                 page = find_get_page(tree->mapping, index);
1115                 unlock_page(page);
1116                 page_cache_release(page);
1117                 index++;
1118         }
1119         return err;
1120 }
1121 EXPORT_SYMBOL(lock_range);
1122
1123 /*
1124  * helper function to unlock both pages and extents in the tree.
1125  */
1126 int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
1127 {
1128         unsigned long index = start >> PAGE_CACHE_SHIFT;
1129         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1130         struct page *page;
1131
1132         while (index <= end_index) {
1133                 page = find_get_page(tree->mapping, index);
1134                 unlock_page(page);
1135                 page_cache_release(page);
1136                 index++;
1137         }
1138         unlock_extent(tree, start, end, GFP_NOFS);
1139         return 0;
1140 }
1141 EXPORT_SYMBOL(unlock_range);
1142
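/*
 * set_state_private/get_state_private stash and retrieve a caller-owned u64
 * in the extent_state that starts exactly at 'start'.  -ENOENT is returned
 * if no state begins at that offset.
 */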
1143 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
1144 {
1145         struct rb_node *node;
1146         struct extent_state *state;
1147         int ret = 0;
1148
1149         write_lock_irq(&tree->lock);
1150         /*
1151          * this search will find all the extents that end after
1152          * our range starts.
1153          */
1154         node = tree_search(&tree->state, start);
1155         if (!node || IS_ERR(node)) {
1156                 ret = -ENOENT;
1157                 goto out;
1158         }
1159         state = rb_entry(node, struct extent_state, rb_node);
1160         if (state->start != start) {
1161                 ret = -ENOENT;
1162                 goto out;
1163         }
1164         state->private = private;
1165 out:
1166         write_unlock_irq(&tree->lock);
1167         return ret;
1168 }
1169
1170 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
1171 {
1172         struct rb_node *node;
1173         struct extent_state *state;
1174         int ret = 0;
1175
1176         read_lock_irq(&tree->lock);
1177         /*
1178          * this search will find all the extents that end after
1179          * our range starts.
1180          */
1181         node = tree_search(&tree->state, start);
1182         if (!node || IS_ERR(node)) {
1183                 ret = -ENOENT;
1184                 goto out;
1185         }
1186         state = rb_entry(node, struct extent_state, rb_node);
1187         if (state->start != start) {
1188                 ret = -ENOENT;
1189                 goto out;
1190         }
1191         *private = state->private;
1192 out:
1193         read_unlock_irq(&tree->lock);
1194         return ret;
1195 }
1196
1197 /*
1198  * searches a range in the state tree for a given mask.
1199  * If 'filled' == 1, this returns 1 only if every extent in the range
1200  * has the bits set.  Otherwise, 1 is returned if any bit in the
1201  * range is found set.
1202  */
1203 int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
1204                    int bits, int filled)
1205 {
1206         struct extent_state *state = NULL;
1207         struct rb_node *node;
1208         int bitset = 0;
1209
1210         read_lock_irq(&tree->lock);
1211         node = tree_search(&tree->state, start);
1212         while (node && start <= end) {
1213                 state = rb_entry(node, struct extent_state, rb_node);
1214                 if (state->start > end)
1215                         break;
1216
1217                 if (filled && state->start > start) {
1218                         bitset = 0;
1219                         break;
1220                 }
1221                 if (state->state & bits) {
1222                         bitset = 1;
1223                         if (!filled)
1224                                 break;
1225                 } else if (filled) {
1226                         bitset = 0;
1227                         break;
1228                 }
1229                 start = state->end + 1;
1230                 if (start > end)
1231                         break;
1232                 node = rb_next(node);
1233         }
1234         read_unlock_irq(&tree->lock);
1235         return bitset;
1236 }
1237 EXPORT_SYMBOL(test_range_bit);
1238
1239 /*
1240  * helper function to set a given page up to date if all the
1241  * extents in the tree for that page are up to date
1242  */
1243 static int check_page_uptodate(struct extent_map_tree *tree,
1244                                struct page *page)
1245 {
1246         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1247         u64 end = start + PAGE_CACHE_SIZE - 1;
1248         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1249                 SetPageUptodate(page);
1250         return 0;
1251 }
1252
1253 /*
1254  * helper function to unlock a page if all the extents in the tree
1255  * for that page are unlocked
1256  */
1257 static int check_page_locked(struct extent_map_tree *tree,
1258                              struct page *page)
1259 {
1260         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1261         u64 end = start + PAGE_CACHE_SIZE - 1;
1262         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1263                 unlock_page(page);
1264         return 0;
1265 }
1266
1267 /*
1268  * helper function to end page writeback if all the extents
1269  * in the tree for that page are done with writeback
1270  */
1271 static int check_page_writeback(struct extent_map_tree *tree,
1272                              struct page *page)
1273 {
1274         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1275         u64 end = start + PAGE_CACHE_SIZE - 1;
1276         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1277                 end_page_writeback(page);
1278         return 0;
1279 }
1280
1281 /* lots and lots of room for performance fixes in the end_bio funcs */
1282
1283 /*
1284  * after a writepage IO is done, we need to:
1285  * clear the uptodate bits on error
1286  * clear the writeback bits in the extent tree for this IO
1287  * end_page_writeback if the page has no more pending IO
1288  *
1289  * Scheduling is not allowed, so the extent state tree is expected
1290  * to have one and only one object corresponding to this IO.
1291  */
1292 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1293 static void end_bio_extent_writepage(struct bio *bio, int err)
1294 #else
1295 static int end_bio_extent_writepage(struct bio *bio,
1296                                    unsigned int bytes_done, int err)
1297 #endif
1298 {
1299         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1300         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1301         struct extent_map_tree *tree = bio->bi_private;
1302         u64 start;
1303         u64 end;
1304         int whole_page;
1305
1306 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1307         if (bio->bi_size)
1308                 return 1;
1309 #endif
1310
1311         do {
1312                 struct page *page = bvec->bv_page;
1313                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1314                          bvec->bv_offset;
1315                 end = start + bvec->bv_len - 1;
1316
1317                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1318                         whole_page = 1;
1319                 else
1320                         whole_page = 0;
1321
1322                 if (--bvec >= bio->bi_io_vec)
1323                         prefetchw(&bvec->bv_page->flags);
1324
1325                 if (!uptodate) {
1326                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1327                         ClearPageUptodate(page);
1328                         SetPageError(page);
1329                 }
1330                 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1331
1332                 if (whole_page)
1333                         end_page_writeback(page);
1334                 else
1335                         check_page_writeback(tree, page);
1336                 if (tree->ops && tree->ops->writepage_end_io_hook)
1337                         tree->ops->writepage_end_io_hook(page, start, end);
1338         } while (bvec >= bio->bi_io_vec);
1339
1340         bio_put(bio);
1341 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1342         return 0;
1343 #endif
1344 }
1345
1346 /*
1347  * after a readpage IO is done, we need to:
1348  * clear the uptodate bits on error
1349  * set the uptodate bits if things worked
1350  * set the page up to date if all extents in the tree are uptodate
1351  * clear the lock bit in the extent tree
1352  * unlock the page if there are no other extents locked for it
1353  *
1354  * Scheduling is not allowed, so the extent state tree is expected
1355  * to have one and only one object corresponding to this IO.
1356  */
1357 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1358 static void end_bio_extent_readpage(struct bio *bio, int err)
1359 #else
1360 static int end_bio_extent_readpage(struct bio *bio,
1361                                    unsigned int bytes_done, int err)
1362 #endif
1363 {
1364         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1365         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1366         struct extent_map_tree *tree = bio->bi_private;
1367         u64 start;
1368         u64 end;
1369         int whole_page;
1370         int ret;
1371
1372 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1373         if (bio->bi_size)
1374                 return 1;
1375 #endif
1376
1377         do {
1378                 struct page *page = bvec->bv_page;
1379                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1380                         bvec->bv_offset;
1381                 end = start + bvec->bv_len - 1;
1382
1383                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1384                         whole_page = 1;
1385                 else
1386                         whole_page = 0;
1387
1388                 if (--bvec >= bio->bi_io_vec)
1389                         prefetchw(&bvec->bv_page->flags);
1390
1391                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1392                         ret = tree->ops->readpage_end_io_hook(page, start, end);
1393                         if (ret)
1394                                 uptodate = 0;
1395                 }
1396                 if (uptodate) {
1397                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1398                         if (whole_page)
1399                                 SetPageUptodate(page);
1400                         else
1401                                 check_page_uptodate(tree, page);
1402                 } else {
1403                         ClearPageUptodate(page);
1404                         SetPageError(page);
1405                 }
1406
1407                 unlock_extent(tree, start, end, GFP_ATOMIC);
1408
1409                 if (whole_page)
1410                         unlock_page(page);
1411                 else
1412                         check_page_locked(tree, page);
1413         } while (bvec >= bio->bi_io_vec);
1414
1415         bio_put(bio);
1416 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1417         return 0;
1418 #endif
1419 }
1420
1421 /*
1422  * IO done from prepare_write is pretty simple: we just unlock
1423  * the structs in the extent tree when done, and set the uptodate bits
1424  * as appropriate.
1425  */
1426 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1427 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1428 #else
1429 static int end_bio_extent_preparewrite(struct bio *bio,
1430                                        unsigned int bytes_done, int err)
1431 #endif
1432 {
1433         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1434         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1435         struct extent_map_tree *tree = bio->bi_private;
1436         u64 start;
1437         u64 end;
1438
1439 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1440         if (bio->bi_size)
1441                 return 1;
1442 #endif
1443
1444         do {
1445                 struct page *page = bvec->bv_page;
1446                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1447                         bvec->bv_offset;
1448                 end = start + bvec->bv_len - 1;
1449
1450                 if (--bvec >= bio->bi_io_vec)
1451                         prefetchw(&bvec->bv_page->flags);
1452
1453                 if (uptodate) {
1454                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1455                 } else {
1456                         ClearPageUptodate(page);
1457                         SetPageError(page);
1458                 }
1459
1460                 unlock_extent(tree, start, end, GFP_ATOMIC);
1461
1462         } while (bvec >= bio->bi_io_vec);
1463
1464         bio_put(bio);
1465 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1466         return 0;
1467 #endif
1468 }
1469
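/*
 * allocate a bio for 'nr_vecs' pages aimed at 'first_sector' on 'bdev'.
 * If the allocation fails while the caller is in memory reclaim
 * (PF_MEMALLOC), retry with progressively fewer vecs before giving up and
 * returning NULL.
 */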
1470 static struct bio *
1471 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1472                  gfp_t gfp_flags)
1473 {
1474         struct bio *bio;
1475
1476         bio = bio_alloc(gfp_flags, nr_vecs);
1477
1478         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1479                 while (!bio && (nr_vecs /= 2))
1480                         bio = bio_alloc(gfp_flags, nr_vecs);
1481         }
1482
1483         if (bio) {
1484                 bio->bi_bdev = bdev;
1485                 bio->bi_sector = first_sector;
1486         }
1487         return bio;
1488 }
1489
1490 static int submit_one_bio(int rw, struct bio *bio)
1491 {
1492         int ret = 0;
1493         bio_get(bio);
1494         submit_bio(rw, bio);
1495         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1496                 ret = -EOPNOTSUPP;
1497         bio_put(bio);
1498         return ret;
1499 }
1500
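/*
 * add a page (or part of one) to the bio being built up in *bio_ret.  If
 * the page is not contiguous with the pending bio, or no longer fits in
 * it, the pending bio is submitted first and a fresh one is allocated.
 * When bio_ret is NULL the new bio is submitted immediately.
 */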
1501 static int submit_extent_page(int rw, struct extent_map_tree *tree,
1502                               struct page *page, sector_t sector,
1503                               size_t size, unsigned long offset,
1504                               struct block_device *bdev,
1505                               struct bio **bio_ret,
1506                               int max_pages,
1507                               bio_end_io_t end_io_func)
1508 {
1509         int ret = 0;
1510         struct bio *bio;
1511         int nr;
1512
1513         if (bio_ret && *bio_ret) {
1514                 bio = *bio_ret;
1515                 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1516                     bio_add_page(bio, page, size, offset) < size) {
1517                         ret = submit_one_bio(rw, bio);
1518                         bio = NULL;
1519                 } else {
1520                         return 0;
1521                 }
1522         }
1523         nr = min(max_pages, bio_get_nr_vecs(bdev));
1524         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1525         if (!bio) {
1526                 printk("failed to allocate bio nr %d\n", nr);
                return -ENOMEM;
1527         }
1528         bio_add_page(bio, page, size, offset);
1529         bio->bi_end_io = end_io_func;
1530         bio->bi_private = tree;
1531         if (bio_ret) {
1532                 *bio_ret = bio;
1533         } else {
1534                 ret = submit_one_bio(rw, bio);
1535         }
1536
1537         return ret;
1538 }
1539
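/*
 * mark a page as owned by the extent code: tag page->private with
 * EXTENT_PAGE_PRIVATE and take an extra page reference so the tag sticks
 * around until the page is released.
 */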
1540 void set_page_extent_mapped(struct page *page)
1541 {
1542         if (!PagePrivate(page)) {
1543                 SetPagePrivate(page);
1544                 WARN_ON(!page->mapping->a_ops->invalidatepage);
1545                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1546                 page_cache_get(page);
1547         }
1548 }
1549
1550 /*
1551  * basic readpage implementation.  Locked extent state structs are inserted
1552  * into the tree; they are removed when the IO is done (by the end_io
1553  * handlers).  See the usage sketch after this function.
1554  */
1555 int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
1556                           get_extent_t *get_extent)
1557 {
1558         struct inode *inode = page->mapping->host;
1559         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1560         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1561         u64 end;
1562         u64 cur = start;
1563         u64 extent_offset;
1564         u64 last_byte = i_size_read(inode);
1565         u64 block_start;
1566         u64 cur_end;
1567         sector_t sector;
1568         struct extent_map *em;
1569         struct block_device *bdev;
1570         int ret;
1571         int nr = 0;
1572         size_t page_offset = 0;
1573         size_t iosize;
1574         size_t blocksize = inode->i_sb->s_blocksize;
1575
1576         set_page_extent_mapped(page);
1577
1578         end = page_end;
1579         lock_extent(tree, start, end, GFP_NOFS);
1580
1581         while (cur <= end) {
1582                 if (cur >= last_byte) {
1583                         iosize = PAGE_CACHE_SIZE - page_offset;
1584                         zero_user_page(page, page_offset, iosize, KM_USER0);
1585                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1586                                             GFP_NOFS);
1587                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1588                         break;
1589                 }
1590                 em = get_extent(inode, page, page_offset, cur, end, 0);
1591                 if (IS_ERR(em) || !em) {
1592                         SetPageError(page);
1593                         unlock_extent(tree, cur, end, GFP_NOFS);
1594                         break;
1595                 }
1596
1597                 extent_offset = cur - em->start;
1598                 BUG_ON(em->end < cur);
1599                 BUG_ON(end < cur);
1600
1601                 iosize = min(em->end - cur, end - cur) + 1;
1602                 cur_end = min(em->end, end);
1603                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1604                 sector = (em->block_start + extent_offset) >> 9;
1605                 bdev = em->bdev;
1606                 block_start = em->block_start;
1607                 free_extent_map(em);
1608                 em = NULL;
1609
1610                 /* we've found a hole, just zero and go on */
1611                 if (block_start == EXTENT_MAP_HOLE) {
1612                         zero_user_page(page, page_offset, iosize, KM_USER0);
1613                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1614                                             GFP_NOFS);
1615                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1616                         cur = cur + iosize;
1617                         page_offset += iosize;
1618                         continue;
1619                 }
1620                 /* the get_extent function already copied into the page */
1621                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1622                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1623                         cur = cur + iosize;
1624                         page_offset += iosize;
1625                         continue;
1626                 }
1627
1628                 ret = 0;
1629                 if (tree->ops && tree->ops->readpage_io_hook) {
1630                         ret = tree->ops->readpage_io_hook(page, cur,
1631                                                           cur + iosize - 1);
1632                 }
1633                 if (!ret) {
1634                         ret = submit_extent_page(READ, tree, page,
1635                                                  sector, iosize, page_offset,
1636                                                  bdev, NULL, 1,
1637                                                  end_bio_extent_readpage);
1638                 }
1639                 if (ret)
1640                         SetPageError(page);
1641                 cur = cur + iosize;
1642                 page_offset += iosize;
1643                 nr++;
1644         }
1645         if (!nr) {
1646                 if (!PageError(page))
1647                         SetPageUptodate(page);
1648                 unlock_page(page);
1649         }
1650         return 0;
1651 }
1652 EXPORT_SYMBOL(extent_read_full_page);
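     /*
      * Usage sketch (not part of this file): a filesystem ->readpage method
      * can hand its locked page straight to extent_read_full_page().  The
      * example_tree() and example_get_extent() names are placeholders for
      * whatever tree lookup and get_extent callback the caller really uses.
      *
      *         static int example_readpage(struct file *file, struct page *page)
      *         {
      *                 struct extent_map_tree *tree;
      *
      *                 tree = example_tree(page->mapping->host);
      *                 return extent_read_full_page(tree, page, example_get_extent);
      *         }
      */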
1653
1654 /*
1655  * the writepage semantics are similar to regular writepage.  extent
1656  * records are inserted to lock ranges in the tree, and as dirty areas
1657  * are found, they are marked writeback.  Then the lock bits are removed
1658  * and the end_io handler clears the writeback ranges
1659  */
1660 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1661                               void *data)
1662 {
1663         struct inode *inode = page->mapping->host;
1664         struct extent_page_data *epd = data;
1665         struct extent_map_tree *tree = epd->tree;
1666         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1667         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1668         u64 end;
1669         u64 cur = start;
1670         u64 extent_offset;
1671         u64 last_byte = i_size_read(inode);
1672         u64 block_start;
1673         u64 iosize;
1674         sector_t sector;
1675         struct extent_map *em;
1676         struct block_device *bdev;
1677         int ret;
1678         int nr = 0;
1679         size_t page_offset = 0;
1680         size_t blocksize;
1681         loff_t i_size = i_size_read(inode);
1682         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1683         u64 nr_delalloc;
1684         u64 delalloc_end;
1685
1686         WARN_ON(!PageLocked(page));
1687         if (page->index > end_index) {
1688                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1689                 unlock_page(page);
1690                 return 0;
1691         }
1692
1693         if (page->index == end_index) {
1694                 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1695                 zero_user_page(page, offset,
1696                                PAGE_CACHE_SIZE - offset, KM_USER0);
1697         }
1698
1699         set_page_extent_mapped(page);
1700
1701         lock_extent(tree, start, page_end, GFP_NOFS);
1702         nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
1703                                                &delalloc_end,
1704                                                128 * 1024 * 1024);
1705         if (nr_delalloc) {
1706                 tree->ops->fill_delalloc(inode, start, delalloc_end);
1707                 if (delalloc_end >= page_end + 1) {
1708                         clear_extent_bit(tree, page_end + 1, delalloc_end,
1709                                          EXTENT_LOCKED | EXTENT_DELALLOC,
1710                                          1, 0, GFP_NOFS);
1711                 }
1712                 clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
1713                                  0, 0, GFP_NOFS);
1714                 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1715                         printk("found delalloc bits after clear extent_bit\n");
1716                 }
1717         } else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1718                 printk("found delalloc bits after find_delalloc_range returns 0\n");
1719         }
1720
1721         end = page_end;
1722         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1723                 printk("found delalloc bits after lock_extent\n");
1724         }
1725
1726         if (last_byte <= start) {
1727                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1728                 goto done;
1729         }
1730
1731         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1732         blocksize = inode->i_sb->s_blocksize;
1733
1734         while (cur <= end) {
1735                 if (cur >= last_byte) {
1736                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1737                         break;
1738                 }
1739                 em = epd->get_extent(inode, page, page_offset, cur, end, 1);
1740                 if (IS_ERR(em) || !em) {
1741                         SetPageError(page);
1742                         break;
1743                 }
1744
1745                 extent_offset = cur - em->start;
1746                 BUG_ON(em->end < cur);
1747                 BUG_ON(end < cur);
1748                 iosize = min(em->end - cur, end - cur) + 1;
1749                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1750                 sector = (em->block_start + extent_offset) >> 9;
1751                 bdev = em->bdev;
1752                 block_start = em->block_start;
1753                 free_extent_map(em);
1754                 em = NULL;
1755
1756                 if (block_start == EXTENT_MAP_HOLE ||
1757                     block_start == EXTENT_MAP_INLINE) {
1758                         clear_extent_dirty(tree, cur,
1759                                            cur + iosize - 1, GFP_NOFS);
1760                         cur = cur + iosize;
1761                         page_offset += iosize;
1762                         continue;
1763                 }
1764
1765                 /* leave this out until we have a page_mkwrite call */
1766                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1767                                    EXTENT_DIRTY, 0)) {
1768                         cur = cur + iosize;
1769                         page_offset += iosize;
1770                         continue;
1771                 }
1772                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1773                 if (tree->ops && tree->ops->writepage_io_hook) {
1774                         ret = tree->ops->writepage_io_hook(page, cur,
1775                                                 cur + iosize - 1);
1776                 } else {
1777                         ret = 0;
1778                 }
1779                 if (ret)
1780                         SetPageError(page);
1781                 else {
1782                         unsigned long max_nr = end_index + 1;
1783                         set_range_writeback(tree, cur, cur + iosize - 1);
1784
1785                         ret = submit_extent_page(WRITE, tree, page, sector,
1786                                                  iosize, page_offset, bdev,
1787                                                  &epd->bio, max_nr,
1788                                                  end_bio_extent_writepage);
1789                         if (ret)
1790                                 SetPageError(page);
1791                 }
1792                 cur = cur + iosize;
1793                 page_offset += iosize;
1794                 nr++;
1795         }
1796 done:
1797         unlock_extent(tree, start, page_end, GFP_NOFS);
1798         unlock_page(page);
1799         return 0;
1800 }
1801
1802 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
1803                           get_extent_t *get_extent,
1804                           struct writeback_control *wbc)
1805 {
1806         int ret;
1807         struct extent_page_data epd = {
1808                 .bio = NULL,
1809                 .tree = tree,
1810                 .get_extent = get_extent,
1811         };
1812
1813         ret = __extent_writepage(page, wbc, &epd);
1814         if (epd.bio)
1815                 submit_one_bio(WRITE, epd.bio);
1816         return ret;
1817 }
1818 EXPORT_SYMBOL(extent_write_full_page);
1819
1820 int extent_writepages(struct extent_map_tree *tree,
1821                       struct address_space *mapping,
1822                       get_extent_t *get_extent,
1823                       struct writeback_control *wbc)
1824 {
1825         int ret;
1826         struct extent_page_data epd = {
1827                 .bio = NULL,
1828                 .tree = tree,
1829                 .get_extent = get_extent,
1830         };
1831
1832         ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
1833         if (epd.bio)
1834                 submit_one_bio(WRITE, epd.bio);
1835         return ret;
1836 }
1837 EXPORT_SYMBOL(extent_writepages);
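     /*
      * Usage sketch (not part of this file): ->writepage and ->writepages
      * methods can be thin wrappers around the two helpers above.  As in the
      * readpage sketch, example_tree() and example_get_extent() stand in for
      * the caller's own tree lookup and get_extent callback.
      *
      *         static int example_writepage(struct page *page,
      *                                      struct writeback_control *wbc)
      *         {
      *                 struct extent_map_tree *tree;
      *
      *                 tree = example_tree(page->mapping->host);
      *                 return extent_write_full_page(tree, page,
      *                                               example_get_extent, wbc);
      *         }
      *
      *         static int example_writepages(struct address_space *mapping,
      *                                       struct writeback_control *wbc)
      *         {
      *                 return extent_writepages(example_tree(mapping->host),
      *                                          mapping, example_get_extent,
      *                                          wbc);
      *         }
      */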
1838
1839 /*
1840  * basic invalidatepage code, this waits on any locked or writeback
1841  * ranges corresponding to the page, and then deletes any extent state
1842  * records from the tree
1843  */
1844 int extent_invalidatepage(struct extent_map_tree *tree,
1845                           struct page *page, unsigned long offset)
1846 {
1847         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
1848         u64 end = start + PAGE_CACHE_SIZE - 1;
1849         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
1850
1851         start += (offset + blocksize - 1) & ~(blocksize - 1);
1852         if (start > end)
1853                 return 0;
1854
1855         lock_extent(tree, start, end, GFP_NOFS);
1856         wait_on_extent_writeback(tree, start, end);
1857         clear_extent_bit(tree, start, end,
1858                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
1859                          1, 1, GFP_NOFS);
1860         return 0;
1861 }
1862 EXPORT_SYMBOL(extent_invalidatepage);
1863
1864 /*
1865  * simple commit_write call: the page is marked dirty and the inode
1866  * size is updated when the write extends the file
1867  */
1868 int extent_commit_write(struct extent_map_tree *tree,
1869                         struct inode *inode, struct page *page,
1870                         unsigned from, unsigned to)
1871 {
1872         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1873
1874         set_page_extent_mapped(page);
1875         set_page_dirty(page);
1876
1877         if (pos > inode->i_size) {
1878                 i_size_write(inode, pos);
1879                 mark_inode_dirty(inode);
1880         }
1881         return 0;
1882 }
1883 EXPORT_SYMBOL(extent_commit_write);
1884
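     /*
      * prepare_write helper: walk the blocks touched by [from, to), zeroing
      * the parts of newly allocated blocks that fall outside the write and
      * issuing reads for older blocks that are not yet uptodate.  The range
      * stays locked while the reads are in flight and this function waits
      * for them to finish before returning.
      */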
1885 int extent_prepare_write(struct extent_map_tree *tree,
1886                          struct inode *inode, struct page *page,
1887                          unsigned from, unsigned to, get_extent_t *get_extent)
1888 {
1889         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
1890         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
1891         u64 block_start;
1892         u64 orig_block_start;
1893         u64 block_end;
1894         u64 cur_end;
1895         struct extent_map *em;
1896         unsigned blocksize = 1 << inode->i_blkbits;
1897         size_t page_offset = 0;
1898         size_t block_off_start;
1899         size_t block_off_end;
1900         int err = 0;
1901         int iocount = 0;
1902         int ret = 0;
1903         int isnew;
1904
1905         set_page_extent_mapped(page);
1906
1907         block_start = (page_start + from) & ~((u64)blocksize - 1);
1908         block_end = (page_start + to - 1) | (blocksize - 1);
1909         orig_block_start = block_start;
1910
1911         lock_extent(tree, page_start, page_end, GFP_NOFS);
1912         while(block_start <= block_end) {
1913                 em = get_extent(inode, page, page_offset, block_start,
1914                                 block_end, 1);
1915                 if (IS_ERR(em) || !em) {
1916                         goto err;
1917                 }
1918                 cur_end = min(block_end, em->end);
1919                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
1920                 block_off_end = block_off_start + blocksize;
1921                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
1922
1923                 if (!PageUptodate(page) && isnew &&
1924                     (block_off_end > to || block_off_start < from)) {
1925                         void *kaddr;
1926
1927                         kaddr = kmap_atomic(page, KM_USER0);
1928                         if (block_off_end > to)
1929                                 memset(kaddr + to, 0, block_off_end - to);
1930                         if (block_off_start < from)
1931                                 memset(kaddr + block_off_start, 0,
1932                                        from - block_off_start);
1933                         flush_dcache_page(page);
1934                         kunmap_atomic(kaddr, KM_USER0);
1935                 }
1936                 if (!isnew && !PageUptodate(page) &&
1937                     (block_off_end > to || block_off_start < from) &&
1938                     !test_range_bit(tree, block_start, cur_end,
1939                                     EXTENT_UPTODATE, 1)) {
1940                         u64 sector;
1941                         u64 extent_offset = block_start - em->start;
1942                         size_t iosize;
1943                         sector = (em->block_start + extent_offset) >> 9;
1944                         iosize = (cur_end - block_start + blocksize - 1) &
1945                                 ~((u64)blocksize - 1);
1946                         /*
1947                          * we've already got the extent locked, but we
1948                          * need to split the state such that our end_bio
1949                          * handler can clear the lock.
1950                          */
1951                         set_extent_bit(tree, block_start,
1952                                        block_start + iosize - 1,
1953                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
1954                         ret = submit_extent_page(READ, tree, page,
1955                                          sector, iosize, page_offset, em->bdev,
1956                                          NULL, 1,
1957                                          end_bio_extent_preparewrite);
1958                         iocount++;
1959                         block_start = block_start + iosize;
1960                 } else {
1961                         set_extent_uptodate(tree, block_start, cur_end,
1962                                             GFP_NOFS);
1963                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
1964                         block_start = cur_end + 1;
1965                 }
1966                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
1967                 free_extent_map(em);
1968         }
1969         if (iocount) {
1970                 wait_extent_bit(tree, orig_block_start,
1971                                 block_end, EXTENT_LOCKED);
1972         }
1973         check_page_uptodate(tree, page);
1974 err:
1975         /* FIXME, zero out newly allocated blocks on error */
1976         return err;
1977 }
1978 EXPORT_SYMBOL(extent_prepare_write);
1979
1980 /*
1981  * a helper for releasepage.  As long as there are no locked extents
1982  * in the range corresponding to the page, both state records and extent
1983  * map records are removed
1984  */
1985 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
1986 {
1987         struct extent_map *em;
1988         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1989         u64 end = start + PAGE_CACHE_SIZE - 1;
1990         u64 orig_start = start;
1991         int ret = 1;
1992
1993         while (start <= end) {
1994                 em = lookup_extent_mapping(tree, start, end);
1995                 if (!em || IS_ERR(em))
1996                         break;
1997                 if (!test_range_bit(tree, em->start, em->end,
1998                                     EXTENT_LOCKED, 0)) {
1999                         remove_extent_mapping(tree, em);
2000                         /* once for the rb tree */
2001                         free_extent_map(em);
2002                 }
2003                 start = em->end + 1;
2004                 /* once for us */
2005                 free_extent_map(em);
2006         }
2007         if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
2008                 ret = 0;
2009         else
2010                 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
2011                                  1, 1, GFP_NOFS);
2012         return ret;
2013 }
2014 EXPORT_SYMBOL(try_release_extent_mapping);
2015
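     /*
      * bmap helper: translate a file block number into a disk block number
      * (in units of the inode's block size) via get_extent.  Holes and
      * inline extents report block 0.
      */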
2016 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2017                 get_extent_t *get_extent)
2018 {
2019         struct inode *inode = mapping->host;
2020         u64 start = iblock << inode->i_blkbits;
2021         u64 end = start + (1 << inode->i_blkbits) - 1;
2022         sector_t sector = 0;
2023         struct extent_map *em;
2024
2025         em = get_extent(inode, NULL, 0, start, end, 0);
2026         if (!em || IS_ERR(em))
2027                 return 0;
2028
2029         if (em->block_start == EXTENT_MAP_INLINE ||
2030             em->block_start == EXTENT_MAP_HOLE)
2031                 goto out;
2032
2033         sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2034 out:
2035         free_extent_map(em);
2036         return sector;
2037 }
2038
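     /*
      * small per-tree LRU of recently used extent buffers.  add_lru() takes
      * a reference on the buffer and drops the oldest entry once the list
      * grows past BUFFER_LRU_MAX; find_lru() returns a referenced buffer
      * matching start/len, or NULL.  Callers hold tree->lru_lock.
      */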
2039 static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
2040 {
2041         if (list_empty(&eb->lru)) {
2042                 extent_buffer_get(eb);
2043                 list_add(&eb->lru, &tree->buffer_lru);
2044                 tree->lru_size++;
2045                 if (tree->lru_size >= BUFFER_LRU_MAX) {
2046                         struct extent_buffer *rm;
2047                         rm = list_entry(tree->buffer_lru.prev,
2048                                         struct extent_buffer, lru);
2049                         tree->lru_size--;
2050                         list_del(&rm->lru);
2051                         free_extent_buffer(rm);
2052                 }
2053         } else
2054                 list_move(&eb->lru, &tree->buffer_lru);
2055         return 0;
2056 }

2057 static struct extent_buffer *find_lru(struct extent_map_tree *tree,
2058                                       u64 start, unsigned long len)
2059 {
2060         struct list_head *lru = &tree->buffer_lru;
2061         struct list_head *cur = lru->next;
2062         struct extent_buffer *eb;
2063
2064         if (list_empty(lru))
2065                 return NULL;
2066
2067         do {
2068                 eb = list_entry(cur, struct extent_buffer, lru);
2069                 if (eb->start == start && eb->len == len) {
2070                         extent_buffer_get(eb);
2071                         return eb;
2072                 }
2073                 cur = cur->next;
2074         } while (cur != lru);
2075         return NULL;
2076 }
2077
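     /*
      * helpers for the page cache pages backing an extent buffer:
      * num_extent_pages() counts them, extent_buffer_page() looks up page i
      * using the cached first_page for index 0 and the mapping's radix tree
      * for the rest.
      */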
2078 static inline unsigned long num_extent_pages(u64 start, u64 len)
2079 {
2080         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2081                 (start >> PAGE_CACHE_SHIFT);
2082 }
2083
2084 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2085                                               unsigned long i)
2086 {
2087         struct page *p;
2088         struct address_space *mapping;
2089
2090         if (i == 0)
2091                 return eb->first_page;
2092         i += eb->start >> PAGE_CACHE_SHIFT;
2093         mapping = eb->first_page->mapping;
2094         read_lock_irq(&mapping->tree_lock);
2095         p = radix_tree_lookup(&mapping->page_tree, i);
2096         read_unlock_irq(&mapping->tree_lock);
2097         return p;
2098 }
2099
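     /*
      * find or create the in-memory extent_buffer struct for a range.  The
      * LRU is checked first; on a miss a zeroed buffer with a single
      * reference is allocated from the slab.  Returns NULL if the slab
      * allocation fails.  The backing pages are attached by the callers
      * below.
      */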
2100 static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
2101                                                    u64 start,
2102                                                    unsigned long len,
2103                                                    gfp_t mask)
2104 {
2105         struct extent_buffer *eb = NULL;
2106
2107         spin_lock(&tree->lru_lock);
2108         eb = find_lru(tree, start, len);
2109         spin_unlock(&tree->lru_lock);
2110         if (eb) {
2111                 return eb;
2112         }
2113
2114         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
             if (!eb)
                     return NULL;
2115         INIT_LIST_HEAD(&eb->lru);
2116         eb->start = start;
2117         eb->len = len;
2118         atomic_set(&eb->refs, 1);
2119
2120         return eb;
2121 }
2122
2123 static void __free_extent_buffer(struct extent_buffer *eb)
2124 {
2125         kmem_cache_free(extent_buffer_cache, eb);
2126 }
2127
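     /*
      * return an extent buffer backed by page cache pages for
      * [start, start + len).  Pages are created (or, for page0, reused) and
      * tagged with the EXTENT_PAGE_PRIVATE bits; the buffer is marked
      * uptodate only if every page already was.  The result is added to the
      * tree's LRU.  On failure the references taken so far are dropped and
      * NULL is returned.
      */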
2128 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
2129                                           u64 start, unsigned long len,
2130                                           struct page *page0,
2131                                           gfp_t mask)
2132 {
2133         unsigned long num_pages = num_extent_pages(start, len);
2134         unsigned long i;
2135         unsigned long index = start >> PAGE_CACHE_SHIFT;
2136         struct extent_buffer *eb;
2137         struct page *p;
2138         struct address_space *mapping = tree->mapping;
2139         int uptodate = 1;
2140
2141         eb = __alloc_extent_buffer(tree, start, len, mask);
2142         if (!eb || IS_ERR(eb))
2143                 return NULL;
2144
2145         if (eb->flags & EXTENT_BUFFER_FILLED)
2146                 goto lru_add;
2147
2148         if (page0) {
2149                 eb->first_page = page0;
2150                 i = 1;
2151                 index++;
2152                 page_cache_get(page0);
2153                 mark_page_accessed(page0);
2154                 set_page_extent_mapped(page0);
2155                 set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
2156                                  len << 2);
2157         } else {
2158                 i = 0;
2159         }
2160         for (; i < num_pages; i++, index++) {
2161                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2162                 if (!p) {
2163                         WARN_ON(1);
2164                         goto fail;
2165                 }
2166                 set_page_extent_mapped(p);
2167                 mark_page_accessed(p);
2168                 if (i == 0) {
2169                         eb->first_page = p;
2170                         set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
2171                                          len << 2);
2172                 } else {
2173                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2174                 }
2175                 if (!PageUptodate(p))
2176                         uptodate = 0;
2177                 unlock_page(p);
2178         }
2179         if (uptodate)
2180                 eb->flags |= EXTENT_UPTODATE;
2181         eb->flags |= EXTENT_BUFFER_FILLED;
2182
2183 lru_add:
2184         spin_lock(&tree->lru_lock);
2185         add_lru(tree, eb);
2186         spin_unlock(&tree->lru_lock);
2187         return eb;
2188
2189 fail:
2190         if (!atomic_dec_and_test(&eb->refs))
2191                 return NULL;
2192         for (index = 0; index < i; index++) {
2193                 page_cache_release(extent_buffer_page(eb, index));
2194         }
2195         __free_extent_buffer(eb);
2196         return NULL;
2197 }
2198 EXPORT_SYMBOL(alloc_extent_buffer);
2199
2200 struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
2201                                          u64 start, unsigned long len,
2202                                           gfp_t mask)
2203 {
2204         unsigned long num_pages = num_extent_pages(start, len);
2205         unsigned long i;
2206         unsigned long index = start >> PAGE_CACHE_SHIFT;
2207         struct extent_buffer *eb;
2208         struct page *p;
2209         struct address_space *mapping = tree->mapping;
2210         int uptodate = 1;
2211
2212         eb = __alloc_extent_buffer(tree, start, len, mask);
2213         if (!eb || IS_ERR(eb))
2214                 return NULL;
2215
2216         if (eb->flags & EXTENT_BUFFER_FILLED)
2217                 goto lru_add;
2218
2219         for (i = 0; i < num_pages; i++, index++) {
2220                 p = find_lock_page(mapping, index);
2221                 if (!p) {
2222                         goto fail;
2223                 }
2224                 set_page_extent_mapped(p);
2225                 mark_page_accessed(p);
2226
2227                 if (i == 0) {
2228                         eb->first_page = p;
2229                         set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
2230                                          len << 2);
2231                 } else {
2232                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2233                 }
2234
2235                 if (!PageUptodate(p))
2236                         uptodate = 0;
2237                 unlock_page(p);
2238         }
2239         if (uptodate)
2240                 eb->flags |= EXTENT_UPTODATE;
2241         eb->flags |= EXTENT_BUFFER_FILLED;
2242
2243 lru_add:
2244         spin_lock(&tree->lru_lock);
2245         add_lru(tree, eb);
2246         spin_unlock(&tree->lru_lock);
2247         return eb;
2248 fail:
2249         if (!atomic_dec_and_test(&eb->refs))
2250                 return NULL;
2251         for (index = 0; index < i; index++) {
2252                 page_cache_release(extent_buffer_page(eb, index));
2253         }
2254         __free_extent_buffer(eb);
2255         return NULL;
2256 }
2257 EXPORT_SYMBOL(find_extent_buffer);
2258
2259 void free_extent_buffer(struct extent_buffer *eb)
2260 {
2261         unsigned long i;
2262         unsigned long num_pages;
2263
2264         if (!eb)
2265                 return;
2266
2267         if (!atomic_dec_and_test(&eb->refs))
2268                 return;
2269
2270         num_pages = num_extent_pages(eb->start, eb->len);
2271
2272         for (i = 0; i < num_pages; i++) {
2273                 page_cache_release(extent_buffer_page(eb, i));
2274         }
2275         __free_extent_buffer(eb);
2276 }
2277 EXPORT_SYMBOL(free_extent_buffer);
2278
2279 int clear_extent_buffer_dirty(struct extent_map_tree *tree,
2280                               struct extent_buffer *eb)
2281 {
2282         int set;
2283         unsigned long i;
2284         unsigned long num_pages;
2285         struct page *page;
2286
2287         u64 start = eb->start;
2288         u64 end = start + eb->len - 1;
2289
2290         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2291         num_pages = num_extent_pages(eb->start, eb->len);
2292
2293         for (i = 0; i < num_pages; i++) {
2294                 page = extent_buffer_page(eb, i);
2295                 lock_page(page);
2296                 /*
2297                  * if we're on the last page or the first page and the
2298                  * block isn't aligned on a page boundary, do extra checks
2299                  * to make sure we don't clean a page that is partially dirty
2300                  */
2301                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2302                     ((i == num_pages - 1) &&
2303                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2304                         start = (u64)page->index << PAGE_CACHE_SHIFT;
2305                         end  = start + PAGE_CACHE_SIZE - 1;
2306                         if (test_range_bit(tree, start, end,
2307                                            EXTENT_DIRTY, 0)) {
2308                                 unlock_page(page);
2309                                 continue;
2310                         }
2311                 }
2312                 clear_page_dirty_for_io(page);
2313                 unlock_page(page);
2314         }
2315         return 0;
2316 }
2317 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2318
2319 int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
2320                                     struct extent_buffer *eb)
2321 {
2322         return wait_on_extent_writeback(tree, eb->start,
2323                                         eb->start + eb->len - 1);
2324 }
2325 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2326
2327 int set_extent_buffer_dirty(struct extent_map_tree *tree,
2328                              struct extent_buffer *eb)
2329 {
2330         unsigned long i;
2331         unsigned long num_pages;
2332
2333         num_pages = num_extent_pages(eb->start, eb->len);
2334         for (i = 0; i < num_pages; i++) {
2335                 struct page *page = extent_buffer_page(eb, i);
2336                 /* writepage may need to do something special for the
2337                  * first page, we have to make sure page->private is
2338                  * properly set.  releasepage may drop page->private
2339                  * on us if the page isn't already dirty.
2340                  */
2341                 if (i == 0) {
2342                         lock_page(page);
2343                         set_page_private(page,
2344                                          EXTENT_PAGE_PRIVATE_FIRST_PAGE |
2345                                          eb->len << 2);
2346                 }
2347                 __set_page_dirty_nobuffers(page);
2348                 if (i == 0)
2349                         unlock_page(page);
2350         }
2351         return set_extent_dirty(tree, eb->start,
2352                                 eb->start + eb->len - 1, GFP_NOFS);
2353 }
2354 EXPORT_SYMBOL(set_extent_buffer_dirty);
2355
2356 int set_extent_buffer_uptodate(struct extent_map_tree *tree,
2357                                 struct extent_buffer *eb)
2358 {
2359         unsigned long i;
2360         struct page *page;
2361         unsigned long num_pages;
2362
2363         num_pages = num_extent_pages(eb->start, eb->len);
2364
2365         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2366                             GFP_NOFS);
2367         for (i = 0; i < num_pages; i++) {
2368                 page = extent_buffer_page(eb, i);
2369                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2370                     ((i == num_pages - 1) &&
2371                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2372                         check_page_uptodate(tree, page);
2373                         continue;
2374                 }
2375                 SetPageUptodate(page);
2376         }
2377         return 0;
2378 }
2379 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2380
2381 int extent_buffer_uptodate(struct extent_map_tree *tree,
2382                              struct extent_buffer *eb)
2383 {
2384         if (eb->flags & EXTENT_UPTODATE)
2385                 return 1;
2386         return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2387                            EXTENT_UPTODATE, 1);
2388 }
2389 EXPORT_SYMBOL(extent_buffer_uptodate);
2390
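     /*
      * make sure the pages backing an extent buffer are uptodate, starting
      * at the page containing 'start' (0 means the whole buffer).  With
      * wait set this blocks until the reads finish and returns -EIO if any
      * page is still not uptodate; without wait it only kicks off reads on
      * pages it can lock without blocking.
      */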
2391 int read_extent_buffer_pages(struct extent_map_tree *tree,
2392                              struct extent_buffer *eb,
2393                              u64 start,
2394                              int wait)
2395 {
2396         unsigned long i;
2397         unsigned long start_i;
2398         struct page *page;
2399         int err;
2400         int ret = 0;
2401         unsigned long num_pages;
2402
2403         if (eb->flags & EXTENT_UPTODATE)
2404                 return 0;
2405
2406         if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2407                            EXTENT_UPTODATE, 1)) {
2408                 return 0;
2409         }
2410         if (start) {
2411                 WARN_ON(start < eb->start);
2412                 start_i = (start >> PAGE_CACHE_SHIFT) -
2413                         (eb->start >> PAGE_CACHE_SHIFT);
2414         } else {
2415                 start_i = 0;
2416         }
2417
2418         num_pages = num_extent_pages(eb->start, eb->len);
2419         for (i = start_i; i < num_pages; i++) {
2420                 page = extent_buffer_page(eb, i);
2421                 if (PageUptodate(page)) {
2422                         continue;
2423                 }
2424                 if (!wait) {
2425                         if (TestSetPageLocked(page)) {
2426                                 continue;
2427                         }
2428                 } else {
2429                         lock_page(page);
2430                 }
2431                 if (!PageUptodate(page)) {
2432                         err = page->mapping->a_ops->readpage(NULL, page);
2433                         if (err) {
2434                                 ret = err;
2435                         }
2436                 } else {
2437                         unlock_page(page);
2438                 }
2439         }
2440
2441         if (ret || !wait) {
2442                 return ret;
2443         }
2444
2445         for (i = start_i; i < num_pages; i++) {
2446                 page = extent_buffer_page(eb, i);
2447                 wait_on_page_locked(page);
2448                 if (!PageUptodate(page)) {
2449                         ret = -EIO;
2450                 }
2451         }
2452         if (!ret)
2453                 eb->flags |= EXTENT_UPTODATE;
2454         return ret;
2455 }
2456 EXPORT_SYMBOL(read_extent_buffer_pages);
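     /*
      * Usage sketch (not part of this file): reading metadata through the
      * extent buffer API.  tree, bytenr and blocksize are illustrative; a
      * real caller wraps this in its own helpers.
      *
      *         struct extent_buffer *eb;
      *         u32 data;
      *
      *         eb = alloc_extent_buffer(tree, bytenr, blocksize, NULL, GFP_NOFS);
      *         if (!eb)
      *                 return -ENOMEM;
      *         if (read_extent_buffer_pages(tree, eb, 0, 1)) {
      *                 free_extent_buffer(eb);
      *                 return -EIO;
      *         }
      *         read_extent_buffer(eb, &data, 0, sizeof(data));
      *         free_extent_buffer(eb);
      */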
2457
2458 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2459                         unsigned long start,
2460                         unsigned long len)
2461 {
2462         size_t cur;
2463         size_t offset;
2464         struct page *page;
2465         char *kaddr;
2466         char *dst = (char *)dstv;
2467         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2468         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2469         unsigned long num_pages = num_extent_pages(eb->start, eb->len);
2470
2471         WARN_ON(start > eb->len);
2472         WARN_ON(start + len > eb->start + eb->len);
2473
2474         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2475
2476         while(len > 0) {
2477                 page = extent_buffer_page(eb, i);
2478                 if (!PageUptodate(page)) {
2479                         printk("page %lu not up to date i %lu, total %lu, len %lu\n",
                                    page->index, i, num_pages, eb->len);
2480                         WARN_ON(1);
2481                 }
2482                 WARN_ON(!PageUptodate(page));
2483
2484                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2485                 kaddr = kmap_atomic(page, KM_USER1);
2486                 memcpy(dst, kaddr + offset, cur);
2487                 kunmap_atomic(kaddr, KM_USER1);
2488
2489                 dst += cur;
2490                 len -= cur;
2491                 offset = 0;
2492                 i++;
2493         }
2494 }
2495 EXPORT_SYMBOL(read_extent_buffer);
2496
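     /*
      * map min_len bytes of an extent buffer starting at 'start' with
      * kmap_atomic slot km.  The range must not cross a page boundary
      * (-EINVAL otherwise).  *token is what unmap_extent_buffer() wants
      * back, *map points at the requested offset, and *map_start/*map_len
      * describe the mapped chunk in buffer offsets.
      */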
2497 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
2498                                unsigned long min_len, char **token, char **map,
2499                                unsigned long *map_start,
2500                                unsigned long *map_len, int km)
2501 {
2502         size_t offset = start & (PAGE_CACHE_SIZE - 1);
2503         char *kaddr;
2504         struct page *p;
2505         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2506         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2507         unsigned long end_i = (start_offset + start + min_len - 1) >>
2508                 PAGE_CACHE_SHIFT;
2509
2510         if (i != end_i)
2511                 return -EINVAL;
2512
2513         if (i == 0) {
2514                 offset = start_offset;
2515                 *map_start = 0;
2516         } else {
2517                 offset = 0;
2518                 *map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
2519         }
2520         if (start + min_len > eb->len) {
2521                 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n",
                            eb->start, eb->len, start, min_len);
2522                 WARN_ON(1);
2523         }
2524
2525         p = extent_buffer_page(eb, i);
2526         WARN_ON(!PageUptodate(p));
2527         kaddr = kmap_atomic(p, km);
2528         *token = kaddr;
2529         *map = kaddr + offset;
2530         *map_len = PAGE_CACHE_SIZE - offset;
2531         return 0;
2532 }
2533 EXPORT_SYMBOL(map_private_extent_buffer);
2534
2535 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
2536                       unsigned long min_len,
2537                       char **token, char **map,
2538                       unsigned long *map_start,
2539                       unsigned long *map_len, int km)
2540 {
2541         int err;
2542         int save = 0;
2543         if (eb->map_token) {
2544                 unmap_extent_buffer(eb, eb->map_token, km);
2545                 eb->map_token = NULL;
2546                 save = 1;
2547         }
2548         err = map_private_extent_buffer(eb, start, min_len, token, map,
2549                                        map_start, map_len, km);
2550         if (!err && save) {
2551                 eb->map_token = *token;
2552                 eb->kaddr = *map;
2553                 eb->map_start = *map_start;
2554                 eb->map_len = *map_len;
2555         }
2556         return err;
2557 }
2558 EXPORT_SYMBOL(map_extent_buffer);
2559
2560 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
2561 {
2562         kunmap_atomic(token, km);
2563 }
2564 EXPORT_SYMBOL(unmap_extent_buffer);
2565
2566 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
2567                           unsigned long start,
2568                           unsigned long len)
2569 {
2570         size_t cur;
2571         size_t offset;
2572         struct page *page;
2573         char *kaddr;
2574         char *ptr = (char *)ptrv;
2575         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2576         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2577         int ret = 0;
2578
2579         WARN_ON(start > eb->len);
2580         WARN_ON(start + len > eb->start + eb->len);
2581
2582         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2583
2584         while(len > 0) {
2585                 page = extent_buffer_page(eb, i);
2586                 WARN_ON(!PageUptodate(page));
2587
2588                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2589
2590                 kaddr = kmap_atomic(page, KM_USER0);
2591                 ret = memcmp(ptr, kaddr + offset, cur);
2592                 kunmap_atomic(kaddr, KM_USER0);
2593                 if (ret)
2594                         break;
2595
2596                 ptr += cur;
2597                 len -= cur;
2598                 offset = 0;
2599                 i++;
2600         }
2601         return ret;
2602 }
2603 EXPORT_SYMBOL(memcmp_extent_buffer);
2604
2605 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
2606                          unsigned long start, unsigned long len)
2607 {
2608         size_t cur;
2609         size_t offset;
2610         struct page *page;
2611         char *kaddr;
2612         char *src = (char *)srcv;
2613         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2614         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2615
2616         WARN_ON(start > eb->len);
2617         WARN_ON(start + len > eb->start + eb->len);
2618
2619         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2620
2621         while(len > 0) {
2622                 page = extent_buffer_page(eb, i);
2623                 WARN_ON(!PageUptodate(page));
2624
2625                 cur = min(len, PAGE_CACHE_SIZE - offset);
2626                 kaddr = kmap_atomic(page, KM_USER1);
2627                 memcpy(kaddr + offset, src, cur);
2628                 kunmap_atomic(kaddr, KM_USER1);
2629
2630                 src += cur;
2631                 len -= cur;
2632                 offset = 0;
2633                 i++;
2634         }
2635 }
2636 EXPORT_SYMBOL(write_extent_buffer);
2637
2638 void memset_extent_buffer(struct extent_buffer *eb, char c,
2639                           unsigned long start, unsigned long len)
2640 {
2641         size_t cur;
2642         size_t offset;
2643         struct page *page;
2644         char *kaddr;
2645         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2646         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2647
2648         WARN_ON(start > eb->len);
2649         WARN_ON(start + len > eb->start + eb->len);
2650
2651         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2652
2653         while(len > 0) {
2654                 page = extent_buffer_page(eb, i);
2655                 WARN_ON(!PageUptodate(page));
2656
2657                 cur = min(len, PAGE_CACHE_SIZE - offset);
2658                 kaddr = kmap_atomic(page, KM_USER0);
2659                 memset(kaddr + offset, c, cur);
2660                 kunmap_atomic(kaddr, KM_USER0);
2661
2662                 len -= cur;
2663                 offset = 0;
2664                 i++;
2665         }
2666 }
2667 EXPORT_SYMBOL(memset_extent_buffer);
2668
2669 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
2670                         unsigned long dst_offset, unsigned long src_offset,
2671                         unsigned long len)
2672 {
2673         u64 dst_len = dst->len;
2674         size_t cur;
2675         size_t offset;
2676         struct page *page;
2677         char *kaddr;
2678         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2679         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
2680
2681         WARN_ON(src->len != dst_len);
2682
2683         offset = (start_offset + dst_offset) &
2684                 ((unsigned long)PAGE_CACHE_SIZE - 1);
2685
2686         while(len > 0) {
2687                 page = extent_buffer_page(dst, i);
2688                 WARN_ON(!PageUptodate(page));
2689
2690                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
2691
2692                 kaddr = kmap_atomic(page, KM_USER0);
2693                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
2694                 kunmap_atomic(kaddr, KM_USER0);
2695
2696                 src_offset += cur;
2697                 len -= cur;
2698                 offset = 0;
2699                 i++;
2700         }
2701 }
2702 EXPORT_SYMBOL(copy_extent_buffer);
2703
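     /*
      * raw page-to-page helpers for the memmove/memcpy routines below:
      * copy_pages() is a plain forward memcpy, move_pages() copies
      * backwards (byte by byte when two distinct pages are involved) so
      * overlapping ranges behave like memmove.
      */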
2704 static void move_pages(struct page *dst_page, struct page *src_page,
2705                        unsigned long dst_off, unsigned long src_off,
2706                        unsigned long len)
2707 {
2708         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
2709         if (dst_page == src_page) {
2710                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
2711         } else {
2712                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
2713                 char *p = dst_kaddr + dst_off + len;
2714                 char *s = src_kaddr + src_off + len;
2715
2716                 while (len--)
2717                         *--p = *--s;
2718
2719                 kunmap_atomic(src_kaddr, KM_USER1);
2720         }
2721         kunmap_atomic(dst_kaddr, KM_USER0);
2722 }
2723
2724 static void copy_pages(struct page *dst_page, struct page *src_page,
2725                        unsigned long dst_off, unsigned long src_off,
2726                        unsigned long len)
2727 {
2728         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
2729         char *src_kaddr;
2730
2731         if (dst_page != src_page)
2732                 src_kaddr = kmap_atomic(src_page, KM_USER1);
2733         else
2734                 src_kaddr = dst_kaddr;
2735
2736         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
2737         kunmap_atomic(dst_kaddr, KM_USER0);
2738         if (dst_page != src_page)
2739                 kunmap_atomic(src_kaddr, KM_USER1);
2740 }
2741
2742 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
2743                            unsigned long src_offset, unsigned long len)
2744 {
2745         size_t cur;
2746         size_t dst_off_in_page;
2747         size_t src_off_in_page;
2748         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2749         unsigned long dst_i;
2750         unsigned long src_i;
2751
2752         if (src_offset + len > dst->len) {
2753                 printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
2754                        src_offset, len, dst->len);
2755                 BUG_ON(1);
2756         }
2757         if (dst_offset + len > dst->len) {
2758                 printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
2759                        dst_offset, len, dst->len);
2760                 BUG_ON(1);
2761         }
2762
2763         while(len > 0) {
2764                 dst_off_in_page = (start_offset + dst_offset) &
2765                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2766                 src_off_in_page = (start_offset + src_offset) &
2767                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2768
2769                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
2770                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
2771
2772                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
2773                                                src_off_in_page));
2774                 cur = min_t(unsigned long, cur,
2775                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
2776
2777                 copy_pages(extent_buffer_page(dst, dst_i),
2778                            extent_buffer_page(dst, src_i),
2779                            dst_off_in_page, src_off_in_page, cur);
2780
2781                 src_offset += cur;
2782                 dst_offset += cur;
2783                 len -= cur;
2784         }
2785 }
2786 EXPORT_SYMBOL(memcpy_extent_buffer);
2787
2788 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
2789                            unsigned long src_offset, unsigned long len)
2790 {
2791         size_t cur;
2792         size_t dst_off_in_page;
2793         size_t src_off_in_page;
2794         unsigned long dst_end = dst_offset + len - 1;
2795         unsigned long src_end = src_offset + len - 1;
2796         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2797         unsigned long dst_i;
2798         unsigned long src_i;
2799
2800         if (src_offset + len > dst->len) {
2801                 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
2802                        src_offset, len, dst->len);
2803                 BUG_ON(1);
2804         }
2805         if (dst_offset + len > dst->len) {
2806                 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
2807                        dst_offset, len, dst->len);
2808                 BUG_ON(1);
2809         }
2810         if (dst_offset < src_offset) {
2811                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
2812                 return;
2813         }
2814         while(len > 0) {
2815                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
2816                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
2817
2818                 dst_off_in_page = (start_offset + dst_end) &
2819                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2820                 src_off_in_page = (start_offset + src_end) &
2821                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2822
2823                 cur = min_t(unsigned long, len, src_off_in_page + 1);
2824                 cur = min(cur, dst_off_in_page + 1);
2825                 move_pages(extent_buffer_page(dst, dst_i),
2826                            extent_buffer_page(dst, src_i),
2827                            dst_off_in_page - cur + 1,
2828                            src_off_in_page - cur + 1, cur);
2829
2830                 dst_end -= cur;
2831                 src_end -= cur;
2832                 len -= cur;
2833         }
2834 }
2835 EXPORT_SYMBOL(memmove_extent_buffer);