Btrfs: Fix hole insertion corner cases
fs/btrfs/extent_map.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include "extent_map.h"
16
17 /* temporary prototype until extent_map moves out of btrfs */
18 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
19                                        unsigned long extra_flags,
20                                        void (*ctor)(void *, struct kmem_cache *,
21                                                     unsigned long));
22
23 static struct kmem_cache *extent_map_cache;
24 static struct kmem_cache *extent_state_cache;
25 static struct kmem_cache *extent_buffer_cache;
26
27 static LIST_HEAD(buffers);
28 static LIST_HEAD(states);
29
30 static DEFINE_SPINLOCK(state_lock);
31 #define BUFFER_LRU_MAX 64
32
33 struct tree_entry {
34         u64 start;
35         u64 end;
36         int in_tree;
37         struct rb_node rb_node;
38 };
39
40 struct extent_page_data {
41         struct bio *bio;
42         struct extent_map_tree *tree;
43         get_extent_t *get_extent;
44 };
45
46 int __init extent_map_init(void)
47 {
48         extent_map_cache = btrfs_cache_create("extent_map",
49                                             sizeof(struct extent_map), 0,
50                                             NULL);
51         if (!extent_map_cache)
52                 return -ENOMEM;
53         extent_state_cache = btrfs_cache_create("extent_state",
54                                             sizeof(struct extent_state), 0,
55                                             NULL);
56         if (!extent_state_cache)
57                 goto free_map_cache;
58         extent_buffer_cache = btrfs_cache_create("extent_buffers",
59                                             sizeof(struct extent_buffer), 0,
60                                             NULL);
61         if (!extent_buffer_cache)
62                 goto free_state_cache;
63         return 0;
64
65 free_state_cache:
66         kmem_cache_destroy(extent_state_cache);
67 free_map_cache:
68         kmem_cache_destroy(extent_map_cache);
69         return -ENOMEM;
70 }
71
72 void extent_map_exit(void)
73 {
74         struct extent_state *state;
75
76         while (!list_empty(&states)) {
77                 state = list_entry(states.next, struct extent_state, list);
78                 printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
79                 list_del(&state->list);
80                 kmem_cache_free(extent_state_cache, state);
81
82         }
83
84         if (extent_map_cache)
85                 kmem_cache_destroy(extent_map_cache);
86         if (extent_state_cache)
87                 kmem_cache_destroy(extent_state_cache);
88         if (extent_buffer_cache)
89                 kmem_cache_destroy(extent_buffer_cache);
90 }
91
92 void extent_map_tree_init(struct extent_map_tree *tree,
93                           struct address_space *mapping, gfp_t mask)
94 {
95         tree->map.rb_node = NULL;
96         tree->state.rb_node = NULL;
97         tree->ops = NULL;
98         tree->dirty_bytes = 0;
99         rwlock_init(&tree->lock);
100         spin_lock_init(&tree->lru_lock);
101         tree->mapping = mapping;
102         INIT_LIST_HEAD(&tree->buffer_lru);
103         tree->lru_size = 0;
104 }
105 EXPORT_SYMBOL(extent_map_tree_init);
106
107 void extent_map_tree_empty_lru(struct extent_map_tree *tree)
108 {
109         struct extent_buffer *eb;
110         while(!list_empty(&tree->buffer_lru)) {
111                 eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
112                                 lru);
113                 list_del_init(&eb->lru);
114                 free_extent_buffer(eb);
115         }
116 }
117 EXPORT_SYMBOL(extent_map_tree_empty_lru);
118
119 struct extent_map *alloc_extent_map(gfp_t mask)
120 {
121         struct extent_map *em;
122         em = kmem_cache_alloc(extent_map_cache, mask);
123         if (!em || IS_ERR(em))
124                 return em;
125         em->in_tree = 0;
126         atomic_set(&em->refs, 1);
127         return em;
128 }
129 EXPORT_SYMBOL(alloc_extent_map);
130
131 void free_extent_map(struct extent_map *em)
132 {
133         if (!em)
134                 return;
135         if (atomic_dec_and_test(&em->refs)) {
136                 WARN_ON(em->in_tree);
137                 kmem_cache_free(extent_map_cache, em);
138         }
139 }
140 EXPORT_SYMBOL(free_extent_map);
141
142
143 struct extent_state *alloc_extent_state(gfp_t mask)
144 {
145         struct extent_state *state;
146         unsigned long flags;
147
148         state = kmem_cache_alloc(extent_state_cache, mask);
149         if (!state || IS_ERR(state))
150                 return state;
151         state->state = 0;
152         state->in_tree = 0;
153         state->private = 0;
154
155         spin_lock_irqsave(&state_lock, flags);
156         list_add(&state->list, &states);
157         spin_unlock_irqrestore(&state_lock, flags);
158
159         atomic_set(&state->refs, 1);
160         init_waitqueue_head(&state->wq);
161         return state;
162 }
163 EXPORT_SYMBOL(alloc_extent_state);
164
165 void free_extent_state(struct extent_state *state)
166 {
167         unsigned long flags;
168         if (!state)
169                 return;
170         if (atomic_dec_and_test(&state->refs)) {
171                 WARN_ON(state->in_tree);
172                 spin_lock_irqsave(&state_lock, flags);
173                 list_del(&state->list);
174                 spin_unlock_irqrestore(&state_lock, flags);
175                 kmem_cache_free(extent_state_cache, state);
176         }
177 }
178 EXPORT_SYMBOL(free_extent_state);
179
180 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
181                                    struct rb_node *node)
182 {
183         struct rb_node ** p = &root->rb_node;
184         struct rb_node * parent = NULL;
185         struct tree_entry *entry;
186
187         while(*p) {
188                 parent = *p;
189                 entry = rb_entry(parent, struct tree_entry, rb_node);
190
191                 if (offset < entry->start)
192                         p = &(*p)->rb_left;
193                 else if (offset > entry->end)
194                         p = &(*p)->rb_right;
195                 else
196                         return parent;
197         }
198
199         entry = rb_entry(node, struct tree_entry, rb_node);
200         entry->in_tree = 1;
201         rb_link_node(node, parent, p);
202         rb_insert_color(node, root);
203         return NULL;
204 }
205
206 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
207                                      struct rb_node **prev_ret,
208                                      struct rb_node **next_ret)
209 {
210         struct rb_node * n = root->rb_node;
211         struct rb_node *prev = NULL;
212         struct rb_node *orig_prev = NULL;
213         struct tree_entry *entry;
214         struct tree_entry *prev_entry = NULL;
215
216         while(n) {
217                 entry = rb_entry(n, struct tree_entry, rb_node);
218                 prev = n;
219                 prev_entry = entry;
220
221                 if (offset < entry->start)
222                         n = n->rb_left;
223                 else if (offset > entry->end)
224                         n = n->rb_right;
225                 else
226                         return n;
227         }
228
229         if (prev_ret) {
230                 orig_prev = prev;
231                 while(prev && offset > prev_entry->end) {
232                         prev = rb_next(prev);
233                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
234                 }
235                 *prev_ret = prev;
236                 prev = orig_prev;
237         }
238
239         if (next_ret) {
240                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
241                 while(prev && offset < prev_entry->start) {
242                         prev = rb_prev(prev);
243                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
244                 }
245                 *next_ret = prev;
246         }
247         return NULL;
248 }
249
250 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
251 {
252         struct rb_node *prev;
253         struct rb_node *ret;
254         ret = __tree_search(root, offset, &prev, NULL);
255         if (!ret)
256                 return prev;
257         return ret;
258 }
259
260 static int tree_delete(struct rb_root *root, u64 offset)
261 {
262         struct rb_node *node;
263         struct tree_entry *entry;
264
265         node = __tree_search(root, offset, NULL, NULL);
266         if (!node)
267                 return -ENOENT;
268         entry = rb_entry(node, struct tree_entry, rb_node);
269         entry->in_tree = 0;
270         rb_erase(node, root);
271         return 0;
272 }
273
274 /*
275  * add_extent_mapping tries a simple backward merge with existing
276  * mappings.  The extent_map struct passed in will be inserted into
277  * the tree directly (no copies made, just a reference taken).
278  */
279 int add_extent_mapping(struct extent_map_tree *tree,
280                        struct extent_map *em)
281 {
282         int ret = 0;
283         struct extent_map *prev = NULL;
284         struct rb_node *rb;
285
286         write_lock_irq(&tree->lock);
287         rb = tree_insert(&tree->map, em->end, &em->rb_node);
288         if (rb) {
289                 prev = rb_entry(rb, struct extent_map, rb_node);
290                 ret = -EEXIST;
291                 goto out;
292         }
293         atomic_inc(&em->refs);
294         if (em->start != 0) {
295                 rb = rb_prev(&em->rb_node);
296                 if (rb)
297                         prev = rb_entry(rb, struct extent_map, rb_node);
298                 if (prev && prev->end + 1 == em->start &&
299                     ((em->block_start == EXTENT_MAP_HOLE &&
300                       prev->block_start == EXTENT_MAP_HOLE) ||
301                      (em->block_start == EXTENT_MAP_INLINE &&
302                       prev->block_start == EXTENT_MAP_INLINE) ||
303                      (em->block_start == EXTENT_MAP_DELALLOC &&
304                       prev->block_start == EXTENT_MAP_DELALLOC) ||
305                      (em->block_start < EXTENT_MAP_DELALLOC - 1 &&
306                       em->block_start == prev->block_end + 1))) {
307                         em->start = prev->start;
308                         em->block_start = prev->block_start;
309                         rb_erase(&prev->rb_node, &tree->map);
310                         prev->in_tree = 0;
311                         free_extent_map(prev);
312                 }
313         }
314 out:
315         write_unlock_irq(&tree->lock);
316         return ret;
317 }
318 EXPORT_SYMBOL(add_extent_mapping);
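/*
 * Usage sketch (illustrative only; 'tree', 'start', 'end' and 'blocknr' are
 * assumed to come from the caller, field values are indicative):
 *
 *	struct extent_map *em = alloc_extent_map(GFP_NOFS);
 *
 *	if (!em)
 *		return -ENOMEM;
 *	em->start = start;
 *	em->end = end;
 *	em->block_start = blocknr;
 *	em->block_end = blocknr + (end - start);
 *	ret = add_extent_mapping(tree, em);
 *	if (ret == -EEXIST)
 *		... an overlapping mapping is already in the tree ...
 *	free_extent_map(em);	(drops our reference; the tree keeps its own)
 */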
319
320 /*
321  * lookup_extent_mapping returns the first extent_map struct in the
322  * tree that intersects the [start, end] (inclusive) range.  There may
323  * be additional objects in the tree that intersect, so check the object
324  * returned carefully to make sure you don't need additional lookups.
325  */
326 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
327                                          u64 start, u64 end)
328 {
329         struct extent_map *em;
330         struct rb_node *rb_node;
331         struct rb_node *prev = NULL;
332         struct rb_node *next = NULL;
333
334         read_lock_irq(&tree->lock);
335         rb_node = __tree_search(&tree->map, start, &prev, &next);
336         if (!rb_node && prev) {
337                 em = rb_entry(prev, struct extent_map, rb_node);
338                 if (em->start <= end && em->end >= start)
339                         goto found;
340         }
341         if (!rb_node && next) {
342                 em = rb_entry(next, struct extent_map, rb_node);
343                 if (em->start <= end && em->end >= start)
344                         goto found;
345         }
346         if (!rb_node) {
347                 em = NULL;
348                 goto out;
349         }
350         if (IS_ERR(rb_node)) {
351                 em = ERR_PTR(PTR_ERR(rb_node));
352                 goto out;
353         }
354         em = rb_entry(rb_node, struct extent_map, rb_node);
355         if (em->end < start || em->start > end) {
356                 em = NULL;
357                 goto out;
358         }
359 found:
360         atomic_inc(&em->refs);
361 out:
362         read_unlock_irq(&tree->lock);
363         return em;
364 }
365 EXPORT_SYMBOL(lookup_extent_mapping);
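/*
 * Usage sketch (illustrative): look up a cached mapping covering part of
 * [offset, offset + len - 1].  The returned extent_map carries an extra
 * reference that the caller must drop with free_extent_map():
 *
 *	em = lookup_extent_mapping(tree, offset, offset + len - 1);
 *	if (em) {
 *		... em->start/em->end may cover more than was asked for ...
 *		free_extent_map(em);
 *	}
 */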
366
367 /*
368  * removes an extent_map struct from the tree.  No reference counts are
369  * dropped, and no checks are done to see if the range is in use
370  */
371 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
372 {
373         int ret;
374
375         write_lock_irq(&tree->lock);
376         ret = tree_delete(&tree->map, em->end);
377         write_unlock_irq(&tree->lock);
378         return ret;
379 }
380 EXPORT_SYMBOL(remove_extent_mapping);
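/*
 * Usage sketch (illustrative): drop a cached mapping during truncate-style
 * invalidation.  remove_extent_mapping() only unlinks the node, so both the
 * reference held by the tree and the one taken by the lookup still have to
 * be released:
 *
 *	em = lookup_extent_mapping(tree, start, end);
 *	if (em) {
 *		remove_extent_mapping(tree, em);
 *		free_extent_map(em);	(drops the ref the tree held)
 *		free_extent_map(em);	(drops the ref from the lookup)
 *	}
 */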
381
382 /*
383  * utility function to look for merge candidates inside a given range.
384  * Any extents with matching state are merged together into a single
385  * extent in the tree.  Extents with EXTENT_IO in their state field
386  * are not merged because the end_io handlers need to be able to do
387  * operations on them without sleeping (or doing allocations/splits).
388  *
389  * This should be called with the tree lock held.
390  */
391 static int merge_state(struct extent_map_tree *tree,
392                        struct extent_state *state)
393 {
394         struct extent_state *other;
395         struct rb_node *other_node;
396
397         if (state->state & EXTENT_IOBITS)
398                 return 0;
399
400         other_node = rb_prev(&state->rb_node);
401         if (other_node) {
402                 other = rb_entry(other_node, struct extent_state, rb_node);
403                 if (other->end == state->start - 1 &&
404                     other->state == state->state) {
405                         state->start = other->start;
406                         other->in_tree = 0;
407                         rb_erase(&other->rb_node, &tree->state);
408                         free_extent_state(other);
409                 }
410         }
411         other_node = rb_next(&state->rb_node);
412         if (other_node) {
413                 other = rb_entry(other_node, struct extent_state, rb_node);
414                 if (other->start == state->end + 1 &&
415                     other->state == state->state) {
416                         other->start = state->start;
417                         state->in_tree = 0;
418                         rb_erase(&state->rb_node, &tree->state);
419                         free_extent_state(state);
420                 }
421         }
422         return 0;
423 }
424
425 /*
426  * insert an extent_state struct into the tree.  'bits' are set on the
427  * struct before it is inserted.
428  *
429  * This may return -EEXIST if the extent is already there, in which case the
430  * state struct is freed.
431  *
432  * The tree lock is not taken internally.  This is a utility function and
433  * probably isn't what you want to call (see set/clear_extent_bit).
434  */
435 static int insert_state(struct extent_map_tree *tree,
436                         struct extent_state *state, u64 start, u64 end,
437                         int bits)
438 {
439         struct rb_node *node;
440
441         if (end < start) {
442                 printk("end < start %Lu %Lu\n", end, start);
443                 WARN_ON(1);
444         }
445         if (bits & EXTENT_DIRTY)
446                 tree->dirty_bytes += end - start + 1;
447         state->state |= bits;
448         state->start = start;
449         state->end = end;
450         node = tree_insert(&tree->state, end, &state->rb_node);
451         if (node) {
452                 struct extent_state *found;
453                 found = rb_entry(node, struct extent_state, rb_node);
454                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
455                 free_extent_state(state);
456                 return -EEXIST;
457         }
458         merge_state(tree, state);
459         return 0;
460 }
461
462 /*
463  * split a given extent state struct in two, inserting the preallocated
464  * struct 'prealloc' as the newly created second half.  'split' indicates an
465  * offset inside 'orig' where it should be split.
466  *
467  * Before calling,
468  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
469  * are two extent state structs in the tree:
470  * prealloc: [orig->start, split - 1]
471  * orig: [ split, orig->end ]
472  *
473  * The tree locks are not taken by this function. They need to be held
474  * by the caller.
475  */
476 static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
477                        struct extent_state *prealloc, u64 split)
478 {
479         struct rb_node *node;
480         prealloc->start = orig->start;
481         prealloc->end = split - 1;
482         prealloc->state = orig->state;
483         orig->start = split;
484
485         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
486         if (node) {
487                 struct extent_state *found;
488                 found = rb_entry(node, struct extent_state, rb_node);
489                 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
490                 free_extent_state(prealloc);
491                 return -EEXIST;
492         }
493         return 0;
494 }
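/*
 * Worked example for split_state() (numbers are illustrative): with 'orig'
 * covering [0, 8191] and split == 4096, the tree afterwards holds 'prealloc'
 * at [0, 4095] and 'orig' at [4096, 8191].  Both carry orig's state bits,
 * and the caller still owns its references on each struct.
 */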
495
496 /*
497  * utility function to clear some bits in an extent state struct.
498  * it will optionally wake up anyone waiting on this state (wake == 1), or
499  * forcibly remove the state from the tree (delete == 1).
500  *
501  * If no bits are set on the state struct after clearing things, the
502  * struct is freed and removed from the tree
503  */
504 static int clear_state_bit(struct extent_map_tree *tree,
505                             struct extent_state *state, int bits, int wake,
506                             int delete)
507 {
508         int ret = state->state & bits;
509
510         if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
511                 u64 range = state->end - state->start + 1;
512                 WARN_ON(range > tree->dirty_bytes);
513                 tree->dirty_bytes -= range;
514         }
515         state->state &= ~bits;
516         if (wake)
517                 wake_up(&state->wq);
518         if (delete || state->state == 0) {
519                 if (state->in_tree) {
520                         rb_erase(&state->rb_node, &tree->state);
521                         state->in_tree = 0;
522                         free_extent_state(state);
523                 } else {
524                         WARN_ON(1);
525                 }
526         } else {
527                 merge_state(tree, state);
528         }
529         return ret;
530 }
531
532 /*
533  * clear some bits on a range in the tree.  This may require splitting
534  * or inserting elements in the tree, so the gfp mask is used to
535  * indicate which allocations or sleeping are allowed.
536  *
537  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
538  * the given range from the tree regardless of state (i.e. for truncate).
539  *
540  * the range [start, end] is inclusive.
541  *
542  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
543  * bits were already set, or zero if none of the bits were already set.
544  */
545 int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
546                      int bits, int wake, int delete, gfp_t mask)
547 {
548         struct extent_state *state;
549         struct extent_state *prealloc = NULL;
550         struct rb_node *node;
551         unsigned long flags;
552         int err;
553         int set = 0;
554
555 again:
556         if (!prealloc && (mask & __GFP_WAIT)) {
557                 prealloc = alloc_extent_state(mask);
558                 if (!prealloc)
559                         return -ENOMEM;
560         }
561
562         write_lock_irqsave(&tree->lock, flags);
563         /*
564          * this search will find the extents that end after
565          * our range starts
566          */
567         node = tree_search(&tree->state, start);
568         if (!node)
569                 goto out;
570         state = rb_entry(node, struct extent_state, rb_node);
571         if (state->start > end)
572                 goto out;
573         WARN_ON(state->end < start);
574
575         /*
576          *     | ---- desired range ---- |
577          *  | state | or
578          *  | ------------- state -------------- |
579          *
580          * We need to split the extent we found, and may flip
581          * bits on second half.
582          *
583          * If the extent we found extends past our range, we
584          * just split and search again.  It'll get split again
585          * the next time though.
586          *
587          * If the extent we found is inside our range, we clear
588          * the desired bit on it.
589          */
590
591         if (state->start < start) {
592                 err = split_state(tree, state, prealloc, start);
593                 BUG_ON(err == -EEXIST);
594                 prealloc = NULL;
595                 if (err)
596                         goto out;
597                 if (state->end <= end) {
598                         start = state->end + 1;
599                         set |= clear_state_bit(tree, state, bits,
600                                         wake, delete);
601                 } else {
602                         start = state->start;
603                 }
604                 goto search_again;
605         }
606         /*
607          * | ---- desired range ---- |
608          *                        | state |
609          * We need to split the extent, and clear the bit
610          * on the first half
611          */
612         if (state->start <= end && state->end > end) {
613                 err = split_state(tree, state, prealloc, end + 1);
614                 BUG_ON(err == -EEXIST);
615
616                 if (wake)
617                         wake_up(&state->wq);
618                 set |= clear_state_bit(tree, prealloc, bits,
619                                        wake, delete);
620                 prealloc = NULL;
621                 goto out;
622         }
623
624         start = state->end + 1;
625         set |= clear_state_bit(tree, state, bits, wake, delete);
626         goto search_again;
627
628 out:
629         write_unlock_irqrestore(&tree->lock, flags);
630         if (prealloc)
631                 free_extent_state(prealloc);
632
633         return set;
634
635 search_again:
636         if (start > end)
637                 goto out;
638         write_unlock_irqrestore(&tree->lock, flags);
639         if (mask & __GFP_WAIT)
640                 cond_resched();
641         goto again;
642 }
643 EXPORT_SYMBOL(clear_extent_bit);
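/*
 * Usage sketch (illustrative): drop the dirty accounting on a byte range,
 * waking any waiters but leaving other bits alone.  Assumes the caller may
 * sleep, so GFP_NOFS lets the internal preallocation happen:
 *
 *	ret = clear_extent_bit(tree, start, end, EXTENT_DIRTY, 1, 0,
 *			       GFP_NOFS);
 *	if (ret > 0)
 *		... at least part of the range really was dirty ...
 */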
644
645 static int wait_on_state(struct extent_map_tree *tree,
646                          struct extent_state *state)
647 {
648         DEFINE_WAIT(wait);
649         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
650         read_unlock_irq(&tree->lock);
651         schedule();
652         read_lock_irq(&tree->lock);
653         finish_wait(&state->wq, &wait);
654         return 0;
655 }
656
657 /*
658  * waits for one or more bits to clear on a range in the state tree.
659  * The range [start, end] is inclusive.
660  * The tree lock is taken by this function
661  */
662 int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
663 {
664         struct extent_state *state;
665         struct rb_node *node;
666
667         read_lock_irq(&tree->lock);
668 again:
669         while (1) {
670                 /*
671                  * this search will find all the extents that end after
672                  * our range starts
673                  */
674                 node = tree_search(&tree->state, start);
675                 if (!node)
676                         break;
677
678                 state = rb_entry(node, struct extent_state, rb_node);
679
680                 if (state->start > end)
681                         goto out;
682
683                 if (state->state & bits) {
684                         start = state->start;
685                         atomic_inc(&state->refs);
686                         wait_on_state(tree, state);
687                         free_extent_state(state);
688                         goto again;
689                 }
690                 start = state->end + 1;
691
692                 if (start > end)
693                         break;
694
695                 if (need_resched()) {
696                         read_unlock_irq(&tree->lock);
697                         cond_resched();
698                         read_lock_irq(&tree->lock);
699                 }
700         }
701 out:
702         read_unlock_irq(&tree->lock);
703         return 0;
704 }
705 EXPORT_SYMBOL(wait_extent_bit);
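/*
 * Usage sketch (illustrative): block until writeback has been cleared on a
 * range before reusing it.  wait_extent_bit() sleeps, so it must not be
 * called from the end_io handlers further down in this file:
 *
 *	wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
 */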
706
707 static void set_state_bits(struct extent_map_tree *tree,
708                            struct extent_state *state,
709                            int bits)
710 {
711         if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
712                 u64 range = state->end - state->start + 1;
713                 tree->dirty_bytes += range;
714         }
715         state->state |= bits;
716 }
717
718 /*
719  * set some bits on a range in the tree.  This may require allocations
720  * or sleeping, so the gfp mask is used to indicate what is allowed.
721  *
722  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
723  * range already has the desired bits set.  The start of the existing
724  * range is returned in failed_start in this case.
725  *
726  * [start, end] is inclusive
727  * This takes the tree lock.
728  */
729 int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
730                    int exclusive, u64 *failed_start, gfp_t mask)
731 {
732         struct extent_state *state;
733         struct extent_state *prealloc = NULL;
734         struct rb_node *node;
735         unsigned long flags;
736         int err = 0;
737         int set;
738         u64 last_start;
739         u64 last_end;
740 again:
741         if (!prealloc && (mask & __GFP_WAIT)) {
742                 prealloc = alloc_extent_state(mask);
743                 if (!prealloc)
744                         return -ENOMEM;
745         }
746
747         write_lock_irqsave(&tree->lock, flags);
748         /*
749          * this search will find all the extents that end after
750          * our range starts.
751          */
752         node = tree_search(&tree->state, start);
753         if (!node) {
754                 err = insert_state(tree, prealloc, start, end, bits);
755                 prealloc = NULL;
756                 BUG_ON(err == -EEXIST);
757                 goto out;
758         }
759
760         state = rb_entry(node, struct extent_state, rb_node);
761         last_start = state->start;
762         last_end = state->end;
763
764         /*
765          * | ---- desired range ---- |
766          * | state |
767          *
768          * Just lock what we found and keep going
769          */
770         if (state->start == start && state->end <= end) {
771                 set = state->state & bits;
772                 if (set && exclusive) {
773                         *failed_start = state->start;
774                         err = -EEXIST;
775                         goto out;
776                 }
777                 set_state_bits(tree, state, bits);
778                 start = state->end + 1;
779                 merge_state(tree, state);
780                 goto search_again;
781         }
782
783         /*
784          *     | ---- desired range ---- |
785          * | state |
786          *   or
787          * | ------------- state -------------- |
788          *
789          * We need to split the extent we found, and may flip bits on
790          * second half.
791          *
792          * If the extent we found extends past our
793          * range, we just split and search again.  It'll get split
794          * again the next time though.
795          *
796          * If the extent we found is inside our range, we set the
797          * desired bit on it.
798          */
799         if (state->start < start) {
800                 set = state->state & bits;
801                 if (exclusive && set) {
802                         *failed_start = start;
803                         err = -EEXIST;
804                         goto out;
805                 }
806                 err = split_state(tree, state, prealloc, start);
807                 BUG_ON(err == -EEXIST);
808                 prealloc = NULL;
809                 if (err)
810                         goto out;
811                 if (state->end <= end) {
812                         set_state_bits(tree, state, bits);
813                         start = state->end + 1;
814                         merge_state(tree, state);
815                 } else {
816                         start = state->start;
817                 }
818                 goto search_again;
819         }
820         /*
821          * | ---- desired range ---- |
822          *     | state | or               | state |
823          *
824          * There's a hole, we need to insert something in it and
825          * ignore the extent we found.
826          */
827         if (state->start > start) {
828                 u64 this_end;
829                 if (end < last_start)
830                         this_end = end;
831                 else
832                         this_end = last_start - 1;
833                 err = insert_state(tree, prealloc, start, this_end,
834                                    bits);
835                 prealloc = NULL;
836                 BUG_ON(err == -EEXIST);
837                 if (err)
838                         goto out;
839                 start = this_end + 1;
840                 goto search_again;
841         }
842         /*
843          * | ---- desired range ---- |
844          *                        | state |
845          * We need to split the extent, and set the bit
846          * on the first half
847          */
848         if (state->start <= end && state->end > end) {
849                 set = state->state & bits;
850                 if (exclusive && set) {
851                         *failed_start = start;
852                         err = -EEXIST;
853                         goto out;
854                 }
855                 err = split_state(tree, state, prealloc, end + 1);
856                 BUG_ON(err == -EEXIST);
857
858                 set_state_bits(tree, prealloc, bits);
859                 merge_state(tree, prealloc);
860                 prealloc = NULL;
861                 goto out;
862         }
863
864         goto search_again;
865
866 out:
867         write_unlock_irqrestore(&tree->lock, flags);
868         if (prealloc)
869                 free_extent_state(prealloc);
870
871         return err;
872
873 search_again:
874         if (start > end)
875                 goto out;
876         write_unlock_irqrestore(&tree->lock, flags);
877         if (mask & __GFP_WAIT)
878                 cond_resched();
879         goto again;
880 }
881 EXPORT_SYMBOL(set_extent_bit);
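/*
 * Usage sketch (illustrative): take an exclusive lock bit on a range.  With
 * 'exclusive' == 1 a second locker gets -EEXIST and failed_start reports
 * where the conflict begins; this is what lock_extent() below builds on:
 *
 *	u64 failed_start;
 *
 *	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
 *			     &failed_start, GFP_NOFS);
 *	if (err == -EEXIST)
 *		... wait on [failed_start, end] and retry ...
 */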
882
883 /* wrappers around set/clear extent bit */
884 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
885                      gfp_t mask)
886 {
887         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
888                               mask);
889 }
890 EXPORT_SYMBOL(set_extent_dirty);
891
892 int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
893                     int bits, gfp_t mask)
894 {
895         return set_extent_bit(tree, start, end, bits, 0, NULL,
896                               mask);
897 }
898 EXPORT_SYMBOL(set_extent_bits);
899
900 int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
901                       int bits, gfp_t mask)
902 {
903         return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
904 }
905 EXPORT_SYMBOL(clear_extent_bits);
906
907 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
908                      gfp_t mask)
909 {
910         return set_extent_bit(tree, start, end,
911                               EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
912                               mask);
913 }
914 EXPORT_SYMBOL(set_extent_delalloc);
915
916 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
917                        gfp_t mask)
918 {
919         return clear_extent_bit(tree, start, end,
920                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
921 }
922 EXPORT_SYMBOL(clear_extent_dirty);
923
924 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
925                      gfp_t mask)
926 {
927         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
928                               mask);
929 }
930 EXPORT_SYMBOL(set_extent_new);
931
932 int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
933                        gfp_t mask)
934 {
935         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
936 }
937 EXPORT_SYMBOL(clear_extent_new);
938
939 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
940                         gfp_t mask)
941 {
942         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
943                               mask);
944 }
945 EXPORT_SYMBOL(set_extent_uptodate);
946
947 int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
948                           gfp_t mask)
949 {
950         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
951 }
952 EXPORT_SYMBOL(clear_extent_uptodate);
953
954 int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
955                          gfp_t mask)
956 {
957         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
958                               0, NULL, mask);
959 }
960 EXPORT_SYMBOL(set_extent_writeback);
961
962 int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
963                            gfp_t mask)
964 {
965         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
966 }
967 EXPORT_SYMBOL(clear_extent_writeback);
968
969 int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
970 {
971         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
972 }
973 EXPORT_SYMBOL(wait_on_extent_writeback);
974
975 /*
976  * locks a range in ascending order, waiting for any locked regions
977  * it hits on the way.  [start,end] are inclusive, and this will sleep.
978  */
979 int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
980 {
981         int err;
982         u64 failed_start;
983         while (1) {
984                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
985                                      &failed_start, mask);
986                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
987                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
988                         start = failed_start;
989                 } else {
990                         break;
991                 }
992                 WARN_ON(start > end);
993         }
994         return err;
995 }
996 EXPORT_SYMBOL(lock_extent);
997
998 int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
999                   gfp_t mask)
1000 {
1001         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
1002 }
1003 EXPORT_SYMBOL(unlock_extent);
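/*
 * Usage sketch (illustrative): bracket an ordered update of a byte range.
 * lock_extent() sleeps until the whole range is locked, and the unlock must
 * cover the same inclusive [start, end]:
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... read or modify the pages backing [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 */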
1004
1005 /*
1006  * helper function to set pages and extents in the tree dirty
1007  */
1008 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
1009 {
1010         unsigned long index = start >> PAGE_CACHE_SHIFT;
1011         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1012         struct page *page;
1013
1014         while (index <= end_index) {
1015                 page = find_get_page(tree->mapping, index);
1016                 BUG_ON(!page);
1017                 __set_page_dirty_nobuffers(page);
1018                 page_cache_release(page);
1019                 index++;
1020         }
1021         set_extent_dirty(tree, start, end, GFP_NOFS);
1022         return 0;
1023 }
1024 EXPORT_SYMBOL(set_range_dirty);
1025
1026 /*
1027  * helper function to set both pages and extents in the tree writeback
1028  */
1029 int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
1030 {
1031         unsigned long index = start >> PAGE_CACHE_SHIFT;
1032         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1033         struct page *page;
1034
1035         while (index <= end_index) {
1036                 page = find_get_page(tree->mapping, index);
1037                 BUG_ON(!page);
1038                 set_page_writeback(page);
1039                 page_cache_release(page);
1040                 index++;
1041         }
1042         set_extent_writeback(tree, start, end, GFP_NOFS);
1043         return 0;
1044 }
1045 EXPORT_SYMBOL(set_range_writeback);
1046
1047 int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
1048                           u64 *start_ret, u64 *end_ret, int bits)
1049 {
1050         struct rb_node *node;
1051         struct extent_state *state;
1052         int ret = 1;
1053
1054         read_lock_irq(&tree->lock);
1055         /*
1056          * this search will find all the extents that end after
1057          * our range starts.
1058          */
1059         node = tree_search(&tree->state, start);
1060         if (!node || IS_ERR(node)) {
1061                 goto out;
1062         }
1063
1064         while(1) {
1065                 state = rb_entry(node, struct extent_state, rb_node);
1066                 if (state->end >= start && (state->state & bits)) {
1067                         *start_ret = state->start;
1068                         *end_ret = state->end;
1069                         ret = 0;
1070                         break;
1071                 }
1072                 node = rb_next(node);
1073                 if (!node)
1074                         break;
1075         }
1076 out:
1077         read_unlock_irq(&tree->lock);
1078         return ret;
1079 }
1080 EXPORT_SYMBOL(find_first_extent_bit);
1081
1082 u64 find_lock_delalloc_range(struct extent_map_tree *tree,
1083                              u64 *start, u64 *end, u64 max_bytes)
1084 {
1085         struct rb_node *node;
1086         struct extent_state *state;
1087         u64 cur_start = *start;
1088         u64 found = 0;
1089         u64 total_bytes = 0;
1090
1091         write_lock_irq(&tree->lock);
1092         /*
1093          * this search will find all the extents that end after
1094          * our range starts.
1095          */
1096 search_again:
1097         node = tree_search(&tree->state, cur_start);
1098         if (!node || IS_ERR(node)) {
1099                 *end = (u64)-1;
1100                 goto out;
1101         }
1102
1103         while(1) {
1104                 state = rb_entry(node, struct extent_state, rb_node);
1105                 if (found && state->start != cur_start) {
1106                         goto out;
1107                 }
1108                 if (!(state->state & EXTENT_DELALLOC)) {
1109                         if (!found)
1110                                 *end = state->end;
1111                         goto out;
1112                 }
1113                 if (!found) {
1114                         struct extent_state *prev_state;
1115                         struct rb_node *prev_node = node;
1116                         while(1) {
1117                                 prev_node = rb_prev(prev_node);
1118                                 if (!prev_node)
1119                                         break;
1120                                 prev_state = rb_entry(prev_node,
1121                                                       struct extent_state,
1122                                                       rb_node);
1123                                 if (!(prev_state->state & EXTENT_DELALLOC))
1124                                         break;
1125                                 state = prev_state;
1126                                 node = prev_node;
1127                         }
1128                 }
1129                 if (state->state & EXTENT_LOCKED) {
1130                         DEFINE_WAIT(wait);
1131                         atomic_inc(&state->refs);
1132                         prepare_to_wait(&state->wq, &wait,
1133                                         TASK_UNINTERRUPTIBLE);
1134                         write_unlock_irq(&tree->lock);
1135                         schedule();
1136                         write_lock_irq(&tree->lock);
1137                         finish_wait(&state->wq, &wait);
1138                         free_extent_state(state);
1139                         goto search_again;
1140                 }
1141                 state->state |= EXTENT_LOCKED;
1142                 if (!found)
1143                         *start = state->start;
1144                 found++;
1145                 *end = state->end;
1146                 cur_start = state->end + 1;
1147                 node = rb_next(node);
1148                 if (!node)
1149                         break;
1150                 total_bytes += state->end - state->start + 1;
1151                 if (total_bytes >= max_bytes)
1152                         break;
1153         }
1154 out:
1155         write_unlock_irq(&tree->lock);
1156         return found;
1157 }
1158
1159 u64 count_range_bits(struct extent_map_tree *tree,
1160                      u64 *start, u64 search_end, u64 max_bytes,
1161                      unsigned long bits)
1162 {
1163         struct rb_node *node;
1164         struct extent_state *state;
1165         u64 cur_start = *start;
1166         u64 total_bytes = 0;
1167         int found = 0;
1168
1169         if (search_end <= cur_start) {
1170                 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1171                 WARN_ON(1);
1172                 return 0;
1173         }
1174
1175         write_lock_irq(&tree->lock);
1176         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1177                 total_bytes = tree->dirty_bytes;
1178                 goto out;
1179         }
1180         /*
1181          * this search will find all the extents that end after
1182          * our range starts.
1183          */
1184         node = tree_search(&tree->state, cur_start);
1185         if (!node || IS_ERR(node)) {
1186                 goto out;
1187         }
1188
1189         while(1) {
1190                 state = rb_entry(node, struct extent_state, rb_node);
1191                 if (state->start > search_end)
1192                         break;
1193                 if (state->end >= cur_start && (state->state & bits)) {
1194                         total_bytes += min(search_end, state->end) + 1 -
1195                                        max(cur_start, state->start);
1196                         if (total_bytes >= max_bytes)
1197                                 break;
1198                         if (!found) {
1199                                 *start = state->start;
1200                                 found = 1;
1201                         }
1202                 }
1203                 node = rb_next(node);
1204                 if (!node)
1205                         break;
1206         }
1207 out:
1208         write_unlock_irq(&tree->lock);
1209         return total_bytes;
1210 }
1211 /*
1212  * helper function to lock both pages and extents in the tree.
1213  * pages must be locked first.
1214  */
1215 int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
1216 {
1217         unsigned long index = start >> PAGE_CACHE_SHIFT;
1218         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1219         struct page *page;
1220         int err;
1221
1222         while (index <= end_index) {
1223                 page = grab_cache_page(tree->mapping, index);
1224                 if (!page) {
1225                         err = -ENOMEM;
1226                         goto failed;
1227                 }
1228                 if (IS_ERR(page)) {
1229                         err = PTR_ERR(page);
1230                         goto failed;
1231                 }
1232                 index++;
1233         }
1234         lock_extent(tree, start, end, GFP_NOFS);
1235         return 0;
1236
1237 failed:
1238         /*
1239          * we failed above in getting the page at 'index', so we undo here
1240          * up to but not including the page at 'index'
1241          */
1242         end_index = index;
1243         index = start >> PAGE_CACHE_SHIFT;
1244         while (index < end_index) {
1245                 page = find_get_page(tree->mapping, index);
1246                 unlock_page(page);
1247                 page_cache_release(page);
1248                 index++;
1249         }
1250         return err;
1251 }
1252 EXPORT_SYMBOL(lock_range);
1253
1254 /*
1255  * helper function to unlock both pages and extents in the tree.
1256  */
1257 int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
1258 {
1259         unsigned long index = start >> PAGE_CACHE_SHIFT;
1260         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1261         struct page *page;
1262
1263         while (index <= end_index) {
1264                 page = find_get_page(tree->mapping, index);
1265                 unlock_page(page);
1266                 page_cache_release(page);
1267                 index++;
1268         }
1269         unlock_extent(tree, start, end, GFP_NOFS);
1270         return 0;
1271 }
1272 EXPORT_SYMBOL(unlock_range);
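/*
 * Usage sketch (illustrative): lock_range()/unlock_range() pair both the
 * page locks and the EXTENT_LOCKED bits, so start and end are typically
 * page aligned, with end falling on the last byte of a page:
 *
 *	err = lock_range(tree, start, start + PAGE_CACHE_SIZE - 1);
 *	if (!err) {
 *		... pages and extents for the range are now locked ...
 *		unlock_range(tree, start, start + PAGE_CACHE_SIZE - 1);
 *	}
 */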
1273
1274 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
1275 {
1276         struct rb_node *node;
1277         struct extent_state *state;
1278         int ret = 0;
1279
1280         write_lock_irq(&tree->lock);
1281         /*
1282          * this search will find all the extents that end after
1283          * our range starts.
1284          */
1285         node = tree_search(&tree->state, start);
1286         if (!node || IS_ERR(node)) {
1287                 ret = -ENOENT;
1288                 goto out;
1289         }
1290         state = rb_entry(node, struct extent_state, rb_node);
1291         if (state->start != start) {
1292                 ret = -ENOENT;
1293                 goto out;
1294         }
1295         state->private = private;
1296 out:
1297         write_unlock_irq(&tree->lock);
1298         return ret;
1299 }
1300
1301 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
1302 {
1303         struct rb_node *node;
1304         struct extent_state *state;
1305         int ret = 0;
1306
1307         read_lock_irq(&tree->lock);
1308         /*
1309          * this search will find all the extents that end after
1310          * our range starts.
1311          */
1312         node = tree_search(&tree->state, start);
1313         if (!node || IS_ERR(node)) {
1314                 ret = -ENOENT;
1315                 goto out;
1316         }
1317         state = rb_entry(node, struct extent_state, rb_node);
1318         if (state->start != start) {
1319                 ret = -ENOENT;
1320                 goto out;
1321         }
1322         *private = state->private;
1323 out:
1324         read_unlock_irq(&tree->lock);
1325         return ret;
1326 }
1327
1328 /*
1329  * searches a range in the state tree for a given mask.
1330  * If 'filled' == 1, this returns 1 only if every extent in the range
1331  * has the bits set.  Otherwise, 1 is returned if any bit in the
1332  * range is found set.
1333  */
1334 int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
1335                    int bits, int filled)
1336 {
1337         struct extent_state *state = NULL;
1338         struct rb_node *node;
1339         int bitset = 0;
1340
1341         read_lock_irq(&tree->lock);
1342         node = tree_search(&tree->state, start);
1343         while (node && start <= end) {
1344                 state = rb_entry(node, struct extent_state, rb_node);
1345
1346                 if (filled && state->start > start) {
1347                         bitset = 0;
1348                         break;
1349                 }
1350
1351                 if (state->start > end)
1352                         break;
1353
1354                 if (state->state & bits) {
1355                         bitset = 1;
1356                         if (!filled)
1357                                 break;
1358                 } else if (filled) {
1359                         bitset = 0;
1360                         break;
1361                 }
1362                 start = state->end + 1;
1363                 if (start > end)
1364                         break;
1365                 node = rb_next(node);
1366                 if (!node) {
1367                         if (filled)
1368                                 bitset = 0;
1369                         break;
1370                 }
1371         }
1372         read_unlock_irq(&tree->lock);
1373         return bitset;
1374 }
1375 EXPORT_SYMBOL(test_range_bit);
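/*
 * Usage sketch (illustrative, variable names are made up): the two 'filled'
 * modes.  With filled == 1 the whole range must be covered and carry the
 * bit; with filled == 0 a single byte with the bit set is enough:
 *
 *	all_uptodate = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
 *	any_locked   = test_range_bit(tree, start, end, EXTENT_LOCKED, 0);
 */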
1376
1377 /*
1378  * helper function to set a given page up to date if all the
1379  * extents in the tree for that page are up to date
1380  */
1381 static int check_page_uptodate(struct extent_map_tree *tree,
1382                                struct page *page)
1383 {
1384         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1385         u64 end = start + PAGE_CACHE_SIZE - 1;
1386         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1387                 SetPageUptodate(page);
1388         return 0;
1389 }
1390
1391 /*
1392  * helper function to unlock a page if all the extents in the tree
1393  * for that page are unlocked
1394  */
1395 static int check_page_locked(struct extent_map_tree *tree,
1396                              struct page *page)
1397 {
1398         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1399         u64 end = start + PAGE_CACHE_SIZE - 1;
1400         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1401                 unlock_page(page);
1402         return 0;
1403 }
1404
1405 /*
1406  * helper function to end page writeback if all the extents
1407  * in the tree for that page are done with writeback
1408  */
1409 static int check_page_writeback(struct extent_map_tree *tree,
1410                              struct page *page)
1411 {
1412         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1413         u64 end = start + PAGE_CACHE_SIZE - 1;
1414         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1415                 end_page_writeback(page);
1416         return 0;
1417 }
1418
1419 /* lots and lots of room for performance fixes in the end_bio funcs */
1420
1421 /*
1422  * after a writepage IO is done, we need to:
1423  * clear the uptodate bits on error
1424  * clear the writeback bits in the extent tree for this IO
1425  * end_page_writeback if the page has no more pending IO
1426  *
1427  * Scheduling is not allowed, so the extent state tree is expected
1428  * to have one and only one object corresponding to this IO.
1429  */
1430 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1431 static void end_bio_extent_writepage(struct bio *bio, int err)
1432 #else
1433 static int end_bio_extent_writepage(struct bio *bio,
1434                                    unsigned int bytes_done, int err)
1435 #endif
1436 {
1437         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1438         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1439         struct extent_map_tree *tree = bio->bi_private;
1440         u64 start;
1441         u64 end;
1442         int whole_page;
1443
1444 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1445         if (bio->bi_size)
1446                 return 1;
1447 #endif
1448
1449         do {
1450                 struct page *page = bvec->bv_page;
1451                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1452                          bvec->bv_offset;
1453                 end = start + bvec->bv_len - 1;
1454
1455                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1456                         whole_page = 1;
1457                 else
1458                         whole_page = 0;
1459
1460                 if (--bvec >= bio->bi_io_vec)
1461                         prefetchw(&bvec->bv_page->flags);
1462
1463                 if (!uptodate) {
1464                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1465                         ClearPageUptodate(page);
1466                         SetPageError(page);
1467                 }
1468                 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1469
1470                 if (whole_page)
1471                         end_page_writeback(page);
1472                 else
1473                         check_page_writeback(tree, page);
1474                 if (tree->ops && tree->ops->writepage_end_io_hook)
1475                         tree->ops->writepage_end_io_hook(page, start, end);
1476         } while (bvec >= bio->bi_io_vec);
1477
1478         bio_put(bio);
1479 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1480         return 0;
1481 #endif
1482 }
1483
1484 /*
1485  * after a readpage IO is done, we need to:
1486  * clear the uptodate bits on error
1487  * set the uptodate bits if things worked
1488  * set the page up to date if all extents in the tree are uptodate
1489  * clear the lock bit in the extent tree
1490  * unlock the page if there are no other extents locked for it
1491  *
1492  * Scheduling is not allowed, so the extent state tree is expected
1493  * to have one and only one object corresponding to this IO.
1494  */
1495 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1496 static void end_bio_extent_readpage(struct bio *bio, int err)
1497 #else
1498 static int end_bio_extent_readpage(struct bio *bio,
1499                                    unsigned int bytes_done, int err)
1500 #endif
1501 {
1502         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1503         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1504         struct extent_map_tree *tree = bio->bi_private;
1505         u64 start;
1506         u64 end;
1507         int whole_page;
1508         int ret;
1509
1510 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1511         if (bio->bi_size)
1512                 return 1;
1513 #endif
1514
1515         do {
1516                 struct page *page = bvec->bv_page;
1517                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1518                         bvec->bv_offset;
1519                 end = start + bvec->bv_len - 1;
1520
1521                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1522                         whole_page = 1;
1523                 else
1524                         whole_page = 0;
1525
1526                 if (--bvec >= bio->bi_io_vec)
1527                         prefetchw(&bvec->bv_page->flags);
1528
1529                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1530                         ret = tree->ops->readpage_end_io_hook(page, start, end);
1531                         if (ret)
1532                                 uptodate = 0;
1533                 }
1534                 if (uptodate) {
1535                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1536                         if (whole_page)
1537                                 SetPageUptodate(page);
1538                         else
1539                                 check_page_uptodate(tree, page);
1540                 } else {
1541                         ClearPageUptodate(page);
1542                         SetPageError(page);
1543                 }
1544
1545                 unlock_extent(tree, start, end, GFP_ATOMIC);
1546
1547                 if (whole_page)
1548                         unlock_page(page);
1549                 else
1550                         check_page_locked(tree, page);
1551         } while (bvec >= bio->bi_io_vec);
1552
1553         bio_put(bio);
1554 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1555         return 0;
1556 #endif
1557 }
1558
1559 /*
1560  * IO done from prepare_write is pretty simple, we just unlock
1561  * the structs in the extent tree when done, and set the uptodate bits
1562  * as appropriate.
1563  */
1564 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1565 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1566 #else
1567 static int end_bio_extent_preparewrite(struct bio *bio,
1568                                        unsigned int bytes_done, int err)
1569 #endif
1570 {
1571         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1572         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1573         struct extent_map_tree *tree = bio->bi_private;
1574         u64 start;
1575         u64 end;
1576
1577 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1578         if (bio->bi_size)
1579                 return 1;
1580 #endif
1581
1582         do {
1583                 struct page *page = bvec->bv_page;
1584                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1585                         bvec->bv_offset;
1586                 end = start + bvec->bv_len - 1;
1587
1588                 if (--bvec >= bio->bi_io_vec)
1589                         prefetchw(&bvec->bv_page->flags);
1590
1591                 if (uptodate) {
1592                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1593                 } else {
1594                         ClearPageUptodate(page);
1595                         SetPageError(page);
1596                 }
1597
1598                 unlock_extent(tree, start, end, GFP_ATOMIC);
1599
1600         } while (bvec >= bio->bi_io_vec);
1601
1602         bio_put(bio);
1603 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1604         return 0;
1605 #endif
1606 }
1607
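/*
 * allocate a bio pointed at the given sector on the given device.  If the
 * allocation fails and the caller is in memory reclaim (PF_MEMALLOC), keep
 * retrying with half as many vecs until a smaller bio can be allocated.
 */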
1608 static struct bio *
1609 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1610                  gfp_t gfp_flags)
1611 {
1612         struct bio *bio;
1613
1614         bio = bio_alloc(gfp_flags, nr_vecs);
1615
1616         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1617                 while (!bio && (nr_vecs /= 2))
1618                         bio = bio_alloc(gfp_flags, nr_vecs);
1619         }
1620
1621         if (bio) {
1622                 bio->bi_bdev = bdev;
1623                 bio->bi_sector = first_sector;
1624         }
1625         return bio;
1626 }
1627
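/*
 * send a single bio down to the disk.  We warn if the starting sector is
 * past the end of the device, submit the bio, and return -EOPNOTSUPP if
 * the block layer flagged the request as unsupported.
 */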
1628 static int submit_one_bio(int rw, struct bio *bio)
1629 {
1630         u64 maxsector;
1631         int ret = 0;
1632
1633         bio_get(bio);
1634
1635         maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1636         if (maxsector < bio->bi_sector) {
1637                 printk("sector too large max %Lu got %llu\n", maxsector,
1638                         (unsigned long long)bio->bi_sector);
1639                 WARN_ON(1);
1640         }
1641
1642         submit_bio(rw, bio);
1643         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1644                 ret = -EOPNOTSUPP;
1645         bio_put(bio);
1646         return ret;
1647 }
1648
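/*
 * add a page to the bio being built up in *bio_ret.  If the page is not
 * contiguous with the pending bio, or the bio is already full, the old bio
 * is submitted and a new one started.  When bio_ret is NULL the bio is
 * submitted immediately; otherwise the caller is responsible for
 * submitting the last bio.
 */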
1649 static int submit_extent_page(int rw, struct extent_map_tree *tree,
1650                               struct page *page, sector_t sector,
1651                               size_t size, unsigned long offset,
1652                               struct block_device *bdev,
1653                               struct bio **bio_ret,
1654                               unsigned long max_pages,
1655                               bio_end_io_t end_io_func)
1656 {
1657         int ret = 0;
1658         struct bio *bio;
1659         int nr;
1660
1661         if (bio_ret && *bio_ret) {
1662                 bio = *bio_ret;
1663                 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1664                     bio_add_page(bio, page, size, offset) < size) {
1665                         ret = submit_one_bio(rw, bio);
1666                         bio = NULL;
1667                 } else {
1668                         return 0;
1669                 }
1670         }
1671         nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
1672         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1673         if (!bio) {
1674                 printk("failed to allocate bio nr %d\n", nr);
                     return -ENOMEM;
1675         }
1676         bio_add_page(bio, page, size, offset);
1677         bio->bi_end_io = end_io_func;
1678         bio->bi_private = tree;
1679         if (bio_ret) {
1680                 *bio_ret = bio;
1681         } else {
1682                 ret = submit_one_bio(rw, bio);
1683         }
1684
1685         return ret;
1686 }
1687
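/*
 * tag a page as being controlled by the extent code.  page->private is set
 * to EXTENT_PAGE_PRIVATE and an extra page reference is taken the first
 * time this is called for a page.
 */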
1688 void set_page_extent_mapped(struct page *page)
1689 {
1690         if (!PagePrivate(page)) {
1691                 SetPagePrivate(page);
1692                 WARN_ON(!page->mapping->a_ops->invalidatepage);
1693                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1694                 page_cache_get(page);
1695         }
1696 }
1697
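/*
 * record in page->private that this page is the first page of an extent
 * buffer covering len bytes.
 */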
1698 void set_page_extent_head(struct page *page, unsigned long len)
1699 {
1700         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1701 }
1702
1703 /*
1704  * basic readpage implementation.  Locked extent state structs are inserted
1705  * into the tree that are removed when the IO is done (by the end_io
1706  * handlers)
1707  */
1708 static int __extent_read_full_page(struct extent_map_tree *tree,
1709                                    struct page *page,
1710                                    get_extent_t *get_extent,
1711                                    struct bio **bio)
1712 {
1713         struct inode *inode = page->mapping->host;
1714         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1715         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1716         u64 end;
1717         u64 cur = start;
1718         u64 extent_offset;
1719         u64 last_byte = i_size_read(inode);
1720         u64 block_start;
1721         u64 cur_end;
1722         sector_t sector;
1723         struct extent_map *em;
1724         struct block_device *bdev;
1725         int ret;
1726         int nr = 0;
1727         size_t page_offset = 0;
1728         size_t iosize;
1729         size_t blocksize = inode->i_sb->s_blocksize;
1730
1731         set_page_extent_mapped(page);
1732
1733         end = page_end;
1734         lock_extent(tree, start, end, GFP_NOFS);
1735
1736         while (cur <= end) {
1737                 if (cur >= last_byte) {
1738                         char *userpage;
1739                         iosize = PAGE_CACHE_SIZE - page_offset;
1740                         userpage = kmap_atomic(page, KM_USER0);
1741                         memset(userpage + page_offset, 0, iosize);
1742                         flush_dcache_page(page);
1743                         kunmap_atomic(userpage, KM_USER0);
1744                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1745                                             GFP_NOFS);
1746                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1747                         break;
1748                 }
1749                 em = get_extent(inode, page, page_offset, cur, end, 0);
1750                 if (IS_ERR(em) || !em) {
1751                         SetPageError(page);
1752                         unlock_extent(tree, cur, end, GFP_NOFS);
1753                         break;
1754                 }
1755
1756                 extent_offset = cur - em->start;
1757                 BUG_ON(em->end < cur);
1758                 BUG_ON(end < cur);
1759
1760                 iosize = min(em->end - cur, end - cur) + 1;
1761                 cur_end = min(em->end, end);
1762                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1763                 sector = (em->block_start + extent_offset) >> 9;
1764                 bdev = em->bdev;
1765                 block_start = em->block_start;
1766                 free_extent_map(em);
1767                 em = NULL;
1768
1769                 /* we've found a hole, just zero and go on */
1770                 if (block_start == EXTENT_MAP_HOLE) {
1771                         char *userpage;
1772                         userpage = kmap_atomic(page, KM_USER0);
1773                         memset(userpage + page_offset, 0, iosize);
1774                         flush_dcache_page(page);
1775                         kunmap_atomic(userpage, KM_USER0);
1776
1777                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1778                                             GFP_NOFS);
1779                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1780                         cur = cur + iosize;
1781                         page_offset += iosize;
1782                         continue;
1783                 }
1784                 /* the get_extent function already copied into the page */
1785                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1786                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1787                         cur = cur + iosize;
1788                         page_offset += iosize;
1789                         continue;
1790                 }
1791
1792                 ret = 0;
1793                 if (tree->ops && tree->ops->readpage_io_hook) {
1794                         ret = tree->ops->readpage_io_hook(page, cur,
1795                                                           cur + iosize - 1);
1796                 }
1797                 if (!ret) {
1798                         unsigned long max_nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1799                         max_nr -= page->index;
1800                         ret = submit_extent_page(READ, tree, page,
1801                                          sector, iosize, page_offset,
1802                                          bdev, bio, max_nr,
1803                                          end_bio_extent_readpage);
1804                 }
1805                 if (ret)
1806                         SetPageError(page);
1807                 cur = cur + iosize;
1808                 page_offset += iosize;
1809                 nr++;
1810         }
1811         if (!nr) {
1812                 if (!PageError(page))
1813                         SetPageUptodate(page);
1814                 unlock_page(page);
1815         }
1816         return 0;
1817 }
1818
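/*
 * readpage entry point: read one page and submit whatever bio
 * __extent_read_full_page built up for it.
 */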
1819 int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
1820                             get_extent_t *get_extent)
1821 {
1822         struct bio *bio = NULL;
1823         int ret;
1824
1825         ret = __extent_read_full_page(tree, page, get_extent, &bio);
1826         if (bio)
1827                 submit_one_bio(READ, bio);
1828         return ret;
1829 }
1830 EXPORT_SYMBOL(extent_read_full_page);
1831
1832 /*
1833  * the writepage semantics are similar to regular writepage.  extent
1834  * records are inserted to lock ranges in the tree, and as dirty areas
1835  * are found, they are marked writeback.  Then the lock bits are removed
1836  * and the end_io handler clears the writeback ranges
1837  */
1838 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1839                               void *data)
1840 {
1841         struct inode *inode = page->mapping->host;
1842         struct extent_page_data *epd = data;
1843         struct extent_map_tree *tree = epd->tree;
1844         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1845         u64 delalloc_start;
1846         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1847         u64 end;
1848         u64 cur = start;
1849         u64 extent_offset;
1850         u64 last_byte = i_size_read(inode);
1851         u64 block_start;
1852         u64 iosize;
1853         sector_t sector;
1854         struct extent_map *em;
1855         struct block_device *bdev;
1856         int ret;
1857         int nr = 0;
1858         size_t page_offset = 0;
1859         size_t blocksize;
1860         loff_t i_size = i_size_read(inode);
1861         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1862         u64 nr_delalloc;
1863         u64 delalloc_end;
1864
1865         WARN_ON(!PageLocked(page));
1866         if (page->index > end_index) {
1867                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1868                 unlock_page(page);
1869                 return 0;
1870         }
1871
1872         if (page->index == end_index) {
1873                 char *userpage;
1874
1875                 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1876
1877                 userpage = kmap_atomic(page, KM_USER0);
1878                 memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
1879                 flush_dcache_page(page);
1880                 kunmap_atomic(userpage, KM_USER0);
1881         }
1882
1883         set_page_extent_mapped(page);
1884
1885         delalloc_start = start;
1886         delalloc_end = 0;
1887         while (delalloc_end < page_end) {
1888                 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1889                                                        &delalloc_end,
1890                                                        128 * 1024 * 1024);
1891                 if (nr_delalloc == 0) {
1892                         delalloc_start = delalloc_end + 1;
1893                         continue;
1894                 }
1895                 tree->ops->fill_delalloc(inode, delalloc_start,
1896                                          delalloc_end);
1897                 clear_extent_bit(tree, delalloc_start,
1898                                  delalloc_end,
1899                                  EXTENT_LOCKED | EXTENT_DELALLOC,
1900                                  1, 0, GFP_NOFS);
1901                 delalloc_start = delalloc_end + 1;
1902         }
1903         lock_extent(tree, start, page_end, GFP_NOFS);
1904
1905         end = page_end;
1906         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1907                 printk("found delalloc bits after lock_extent\n");
1908         }
1909
1910         if (last_byte <= start) {
1911                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1912                 goto done;
1913         }
1914
1915         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1916         blocksize = inode->i_sb->s_blocksize;
1917
1918         while (cur <= end) {
1919                 if (cur >= last_byte) {
1920                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1921                         break;
1922                 }
1923                 em = epd->get_extent(inode, page, page_offset, cur, end, 1);
1924                 if (IS_ERR(em) || !em) {
1925                         SetPageError(page);
1926                         break;
1927                 }
1928
1929                 extent_offset = cur - em->start;
1930                 BUG_ON(em->end < cur);
1931                 BUG_ON(end < cur);
1932                 iosize = min(em->end - cur, end - cur) + 1;
1933                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1934                 sector = (em->block_start + extent_offset) >> 9;
1935                 bdev = em->bdev;
1936                 block_start = em->block_start;
1937                 free_extent_map(em);
1938                 em = NULL;
1939
1940                 if (block_start == EXTENT_MAP_HOLE ||
1941                     block_start == EXTENT_MAP_INLINE) {
1942                         clear_extent_dirty(tree, cur,
1943                                            cur + iosize - 1, GFP_NOFS);
1944                         cur = cur + iosize;
1945                         page_offset += iosize;
1946                         continue;
1947                 }
1948
1949                 /* leave this out until we have a page_mkwrite call */
1950                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1951                                    EXTENT_DIRTY, 0)) {
1952                         cur = cur + iosize;
1953                         page_offset += iosize;
1954                         continue;
1955                 }
1956                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1957                 if (tree->ops && tree->ops->writepage_io_hook) {
1958                         ret = tree->ops->writepage_io_hook(page, cur,
1959                                                 cur + iosize - 1);
1960                 } else {
1961                         ret = 0;
1962                 }
1963                 if (ret)
1964                         SetPageError(page);
1965                 else {
1966                         unsigned long max_nr = end_index + 1;
1967                         set_range_writeback(tree, cur, cur + iosize - 1);
1968                         if (!PageWriteback(page)) {
1969                                 printk("warning page %lu not writeback, "
1970                                        "cur %llu end %llu\n", page->index,
1971                                        (unsigned long long)cur,
1972                                        (unsigned long long)end);
1973                         }
1974
1975                         ret = submit_extent_page(WRITE, tree, page, sector,
1976                                                  iosize, page_offset, bdev,
1977                                                  &epd->bio, max_nr,
1978                                                  end_bio_extent_writepage);
1979                         if (ret)
1980                                 SetPageError(page);
1981                 }
1982                 cur = cur + iosize;
1983                 page_offset += iosize;
1984                 nr++;
1985         }
1986 done:
1987         if (nr == 0) {
1988                 /* make sure the mapping tag for page dirty gets cleared */
1989                 set_page_writeback(page);
1990                 end_page_writeback(page);
1991         }
1992         unlock_extent(tree, start, page_end, GFP_NOFS);
1993         unlock_page(page);
1994         return 0;
1995 }
1996
1997 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
1998
1999 /* Taken directly from 2.6.23 for 2.6.18 back port */
2000 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
2001                                 void *data);
2002
2003 /**
2004  * write_cache_pages - walk the list of dirty pages of the given address space
2005  * and write all of them.
2006  * @mapping: address space structure to write
2007  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2008  * @writepage: function called for each page
2009  * @data: data passed to writepage function
2010  *
2011  * If a page is already under I/O, write_cache_pages() skips it, even
2012  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2013  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2014  * and msync() need to guarantee that all the data which was dirty at the time
2015  * the call was made get new I/O started against them.  If wbc->sync_mode is
2016  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2017  * existing IO to complete.
2018  */
2019 static int write_cache_pages(struct address_space *mapping,
2020                       struct writeback_control *wbc, writepage_t writepage,
2021                       void *data)
2022 {
2023         struct backing_dev_info *bdi = mapping->backing_dev_info;
2024         int ret = 0;
2025         int done = 0;
2026         struct pagevec pvec;
2027         int nr_pages;
2028         pgoff_t index;
2029         pgoff_t end;            /* Inclusive */
2030         int scanned = 0;
2031         int range_whole = 0;
2032
2033         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2034                 wbc->encountered_congestion = 1;
2035                 return 0;
2036         }
2037
2038         pagevec_init(&pvec, 0);
2039         if (wbc->range_cyclic) {
2040                 index = mapping->writeback_index; /* Start from prev offset */
2041                 end = -1;
2042         } else {
2043                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2044                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2045                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2046                         range_whole = 1;
2047                 scanned = 1;
2048         }
2049 retry:
2050         while (!done && (index <= end) &&
2051                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2052                                               PAGECACHE_TAG_DIRTY,
2053                                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2054                 unsigned i;
2055
2056                 scanned = 1;
2057                 for (i = 0; i < nr_pages; i++) {
2058                         struct page *page = pvec.pages[i];
2059
2060                         /*
2061                          * At this point we hold neither mapping->tree_lock nor
2062                          * lock on the page itself: the page may be truncated or
2063                          * invalidated (changing page->mapping to NULL), or even
2064                          * swizzled back from swapper_space to tmpfs file
2065                          * mapping
2066                          */
2067                         lock_page(page);
2068
2069                         if (unlikely(page->mapping != mapping)) {
2070                                 unlock_page(page);
2071                                 continue;
2072                         }
2073
2074                         if (!wbc->range_cyclic && page->index > end) {
2075                                 done = 1;
2076                                 unlock_page(page);
2077                                 continue;
2078                         }
2079
2080                         if (wbc->sync_mode != WB_SYNC_NONE)
2081                                 wait_on_page_writeback(page);
2082
2083                         if (PageWriteback(page) ||
2084                             !clear_page_dirty_for_io(page)) {
2085                                 unlock_page(page);
2086                                 continue;
2087                         }
2088
2089                         ret = (*writepage)(page, wbc, data);
2090
2091                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2092                                 unlock_page(page);
2093                                 ret = 0;
2094                         }
2095                         if (ret || (--(wbc->nr_to_write) <= 0))
2096                                 done = 1;
2097                         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2098                                 wbc->encountered_congestion = 1;
2099                                 done = 1;
2100                         }
2101                 }
2102                 pagevec_release(&pvec);
2103                 cond_resched();
2104         }
2105         if (!scanned && !done) {
2106                 /*
2107                  * We hit the last page and there is more work to be done: wrap
2108                  * back to the start of the file
2109                  */
2110                 scanned = 1;
2111                 index = 0;
2112                 goto retry;
2113         }
2114         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2115                 mapping->writeback_index = index;
2116         return ret;
2117 }
2118 #endif
2119
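/*
 * writepage entry point.  After writing the page that was passed in,
 * write_cache_pages is used to opportunistically write up to 64 more dirty
 * pages that follow it in the file, sharing the same bio where possible.
 */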
2120 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
2121                           get_extent_t *get_extent,
2122                           struct writeback_control *wbc)
2123 {
2124         int ret;
2125         struct address_space *mapping = page->mapping;
2126         struct extent_page_data epd = {
2127                 .bio = NULL,
2128                 .tree = tree,
2129                 .get_extent = get_extent,
2130         };
2131         struct writeback_control wbc_writepages = {
2132                 .bdi            = wbc->bdi,
2133                 .sync_mode      = WB_SYNC_NONE,
2134                 .older_than_this = NULL,
2135                 .nr_to_write    = 64,
2136                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
2137                 .range_end      = (loff_t)-1,
2138         };
2139
2140
2141         ret = __extent_writepage(page, wbc, &epd);
2142
2143         write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
2144         if (epd.bio) {
2145                 submit_one_bio(WRITE, epd.bio);
2146         }
2147         return ret;
2148 }
2149 EXPORT_SYMBOL(extent_write_full_page);
2150
2151
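/*
 * writepages entry point.  Walks the dirty pages with write_cache_pages
 * and __extent_writepage, then submits any bio left over at the end.
 */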
2152 int extent_writepages(struct extent_map_tree *tree,
2153                       struct address_space *mapping,
2154                       get_extent_t *get_extent,
2155                       struct writeback_control *wbc)
2156 {
2157         int ret = 0;
2158         struct extent_page_data epd = {
2159                 .bio = NULL,
2160                 .tree = tree,
2161                 .get_extent = get_extent,
2162         };
2163
2164         ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
2165         if (epd.bio) {
2166                 submit_one_bio(WRITE, epd.bio);
2167         }
2168         return ret;
2169 }
2170 EXPORT_SYMBOL(extent_writepages);
2171
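/*
 * readpages entry point used for readahead.  Pages are added to the page
 * cache and the lru by hand (add_to_page_cache_lru isn't exported), then
 * read with a single shared bio where possible.
 */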
2172 int extent_readpages(struct extent_map_tree *tree,
2173                      struct address_space *mapping,
2174                      struct list_head *pages, unsigned nr_pages,
2175                      get_extent_t get_extent)
2176 {
2177         struct bio *bio = NULL;
2178         unsigned page_idx;
2179         struct pagevec pvec;
2180
2181         pagevec_init(&pvec, 0);
2182         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2183                 struct page *page = list_entry(pages->prev, struct page, lru);
2184
2185                 prefetchw(&page->flags);
2186                 list_del(&page->lru);
2187                 /*
2188                  * what we want to do here is call add_to_page_cache_lru,
2189                  * but that isn't exported, so we reproduce it here
2190                  */
2191                 if (!add_to_page_cache(page, mapping,
2192                                         page->index, GFP_KERNEL)) {
2193
2194                         /* open coding of lru_cache_add, also not exported */
2195                         page_cache_get(page);
2196                         if (!pagevec_add(&pvec, page))
2197                                 __pagevec_lru_add(&pvec);
2198                         __extent_read_full_page(tree, page, get_extent, &bio);
2199                 }
2200                 page_cache_release(page);
2201         }
2202         if (pagevec_count(&pvec))
2203                 __pagevec_lru_add(&pvec);
2204         BUG_ON(!list_empty(pages));
2205         if (bio)
2206                 submit_one_bio(READ, bio);
2207         return 0;
2208 }
2209 EXPORT_SYMBOL(extent_readpages);
2210
2211 /*
2212  * basic invalidatepage code, this waits on any locked or writeback
2213  * ranges corresponding to the page, and then deletes any extent state
2214  * records from the tree
2215  */
2216 int extent_invalidatepage(struct extent_map_tree *tree,
2217                           struct page *page, unsigned long offset)
2218 {
2219         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2220         u64 end = start + PAGE_CACHE_SIZE - 1;
2221         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2222
2223         start += (offset + blocksize - 1) & ~(blocksize - 1);
2224         if (start > end)
2225                 return 0;
2226
2227         lock_extent(tree, start, end, GFP_NOFS);
2228         wait_on_extent_writeback(tree, start, end);
2229         clear_extent_bit(tree, start, end,
2230                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2231                          1, 1, GFP_NOFS);
2232         return 0;
2233 }
2234 EXPORT_SYMBOL(extent_invalidatepage);
2235
2236 /*
2237  * simple commit_write call, set_range_dirty is used to mark both
2238  * the pages and the extent records as dirty
2239  */
2240 int extent_commit_write(struct extent_map_tree *tree,
2241                         struct inode *inode, struct page *page,
2242                         unsigned from, unsigned to)
2243 {
2244         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2245
2246         set_page_extent_mapped(page);
2247         set_page_dirty(page);
2248
2249         if (pos > inode->i_size) {
2250                 i_size_write(inode, pos);
2251                 mark_inode_dirty(inode);
2252         }
2253         return 0;
2254 }
2255 EXPORT_SYMBOL(extent_commit_write);
2256
2257 int extent_prepare_write(struct extent_map_tree *tree,
2258                          struct inode *inode, struct page *page,
2259                          unsigned from, unsigned to, get_extent_t *get_extent)
2260 {
2261         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2262         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2263         u64 block_start;
2264         u64 orig_block_start;
2265         u64 block_end;
2266         u64 cur_end;
2267         struct extent_map *em;
2268         unsigned blocksize = 1 << inode->i_blkbits;
2269         size_t page_offset = 0;
2270         size_t block_off_start;
2271         size_t block_off_end;
2272         int err = 0;
2273         int iocount = 0;
2274         int ret = 0;
2275         int isnew;
2276
2277         set_page_extent_mapped(page);
2278
2279         block_start = (page_start + from) & ~((u64)blocksize - 1);
2280         block_end = (page_start + to - 1) | (blocksize - 1);
2281         orig_block_start = block_start;
2282
2283         lock_extent(tree, page_start, page_end, GFP_NOFS);
2284         while (block_start <= block_end) {
2285                 em = get_extent(inode, page, page_offset, block_start,
2286                                 block_end, 1);
2287                 if (IS_ERR(em) || !em) {
                             err = IS_ERR(em) ? PTR_ERR(em) : -EIO;
2288                         goto err;
2289                 }
2290                 cur_end = min(block_end, em->end);
2291                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2292                 block_off_end = block_off_start + blocksize;
2293                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2294
2295                 if (!PageUptodate(page) && isnew &&
2296                     (block_off_end > to || block_off_start < from)) {
2297                         void *kaddr;
2298
2299                         kaddr = kmap_atomic(page, KM_USER0);
2300                         if (block_off_end > to)
2301                                 memset(kaddr + to, 0, block_off_end - to);
2302                         if (block_off_start < from)
2303                                 memset(kaddr + block_off_start, 0,
2304                                        from - block_off_start);
2305                         flush_dcache_page(page);
2306                         kunmap_atomic(kaddr, KM_USER0);
2307                 }
2308                 if ((em->block_start != EXTENT_MAP_HOLE &&
2309                      em->block_start != EXTENT_MAP_INLINE) &&
2310                     !isnew && !PageUptodate(page) &&
2311                     (block_off_end > to || block_off_start < from) &&
2312                     !test_range_bit(tree, block_start, cur_end,
2313                                     EXTENT_UPTODATE, 1)) {
2314                         u64 sector;
2315                         u64 extent_offset = block_start - em->start;
2316                         size_t iosize;
2317                         sector = (em->block_start + extent_offset) >> 9;
2318                         iosize = (cur_end - block_start + blocksize) &
2319                                 ~((u64)blocksize - 1);
2320                         /*
2321                          * we've already got the extent locked, but we
2322                          * need to split the state such that our end_bio
2323                          * handler can clear the lock.
2324                          */
2325                         set_extent_bit(tree, block_start,
2326                                        block_start + iosize - 1,
2327                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2328                         ret = submit_extent_page(READ, tree, page,
2329                                          sector, iosize, page_offset, em->bdev,
2330                                          NULL, 1,
2331                                          end_bio_extent_preparewrite);
2332                         iocount++;
2333                         block_start = block_start + iosize;
2334                 } else {
2335                         set_extent_uptodate(tree, block_start, cur_end,
2336                                             GFP_NOFS);
2337                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2338                         block_start = cur_end + 1;
2339                 }
2340                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2341                 free_extent_map(em);
2342         }
2343         if (iocount) {
2344                 wait_extent_bit(tree, orig_block_start,
2345                                 block_end, EXTENT_LOCKED);
2346         }
2347         check_page_uptodate(tree, page);
2348 err:
2349         /* FIXME, zero out newly allocated blocks on error */
2350         return err;
2351 }
2352 EXPORT_SYMBOL(extent_prepare_write);
2353
2354 /*
2355  * a helper for releasepage.  As long as there are no locked extents
2356  * in the range corresponding to the page, both state records and extent
2357  * map records are removed
2358  */
2359 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
2360 {
2361         struct extent_map *em;
2362         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2363         u64 end = start + PAGE_CACHE_SIZE - 1;
2364         u64 orig_start = start;
2365         int ret = 1;
2366
2367         while (start <= end) {
2368                 em = lookup_extent_mapping(tree, start, end);
2369                 if (!em || IS_ERR(em))
2370                         break;
2371                 if (!test_range_bit(tree, em->start, em->end,
2372                                     EXTENT_LOCKED, 0)) {
2373                         remove_extent_mapping(tree, em);
2374                         /* once for the rb tree */
2375                         free_extent_map(em);
2376                 }
2377                 start = em->end + 1;
2378                 /* once for us */
2379                 free_extent_map(em);
2380         }
2381         if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
2382                 ret = 0;
2383         else
2384                 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
2385                                  1, 1, GFP_NOFS);
2386         return ret;
2387 }
2388 EXPORT_SYMBOL(try_release_extent_mapping);
2389
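/*
 * bmap implementation on top of get_extent.  Holes and inline extents map
 * to sector 0, everything else is translated from the extent's starting
 * block.
 */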
2390 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2391                 get_extent_t *get_extent)
2392 {
2393         struct inode *inode = mapping->host;
2394         u64 start = (u64)iblock << inode->i_blkbits;
2395         u64 end = start + (1 << inode->i_blkbits) - 1;
2396         sector_t sector = 0;
2397         struct extent_map *em;
2398
2399         em = get_extent(inode, NULL, 0, start, end, 0);
2400         if (!em || IS_ERR(em))
2401                 return 0;
2402
2403         if (em->block_start == EXTENT_MAP_INLINE ||
2404             em->block_start == EXTENT_MAP_HOLE)
2405                 goto out;
2406
2407         sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2408 out:
2409         free_extent_map(em);
2410         return sector;
2411 }
2412
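/*
 * add an extent buffer to the head of the tree's lru, dropping the oldest
 * entry once the lru grows past BUFFER_LRU_MAX.  The caller must hold
 * tree->lru_lock.
 */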
2413 static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
2414 {
2415         if (list_empty(&eb->lru)) {
2416                 extent_buffer_get(eb);
2417                 list_add(&eb->lru, &tree->buffer_lru);
2418                 tree->lru_size++;
2419                 if (tree->lru_size >= BUFFER_LRU_MAX) {
2420                         struct extent_buffer *rm;
2421                         rm = list_entry(tree->buffer_lru.prev,
2422                                         struct extent_buffer, lru);
2423                         tree->lru_size--;
2424                         list_del_init(&rm->lru);
2425                         free_extent_buffer(rm);
2426                 }
2427         } else
2428                 list_move(&eb->lru, &tree->buffer_lru);
2429         return 0;
2430 }
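/*
 * search the lru for a buffer that matches start and len.  The buffer is
 * returned with an extra reference, or NULL if nothing matched.  The
 * caller must hold tree->lru_lock.
 */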
2431 static struct extent_buffer *find_lru(struct extent_map_tree *tree,
2432                                       u64 start, unsigned long len)
2433 {
2434         struct list_head *lru = &tree->buffer_lru;
2435         struct list_head *cur = lru->next;
2436         struct extent_buffer *eb;
2437
2438         if (list_empty(lru))
2439                 return NULL;
2440
2441         do {
2442                 eb = list_entry(cur, struct extent_buffer, lru);
2443                 if (eb->start == start && eb->len == len) {
2444                         extent_buffer_get(eb);
2445                         return eb;
2446                 }
2447                 cur = cur->next;
2448         } while (cur != lru);
2449         return NULL;
2450 }
2451
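/* the number of pages spanned by an extent buffer with this start and len */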
2452 static inline unsigned long num_extent_pages(u64 start, u64 len)
2453 {
2454         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2455                 (start >> PAGE_CACHE_SHIFT);
2456 }
2457
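/*
 * return the i'th page covered by an extent buffer.  Page zero is cached
 * in eb->first_page, the rest are looked up in the page cache radix tree
 * of the owning mapping.
 */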
2458 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2459                                               unsigned long i)
2460 {
2461         struct page *p;
2462         struct address_space *mapping;
2463
2464         if (i == 0)
2465                 return eb->first_page;
2466         i += eb->start >> PAGE_CACHE_SHIFT;
2467         mapping = eb->first_page->mapping;
2468         read_lock_irq(&mapping->tree_lock);
2469         p = radix_tree_lookup(&mapping->page_tree, i);
2470         read_unlock_irq(&mapping->tree_lock);
2471         return p;
2472 }
2473
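/*
 * find an extent buffer in the lru, or allocate a new zeroed one with a
 * single reference if nothing in the lru matches.
 */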
2474 static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
2475                                                    u64 start,
2476                                                    unsigned long len,
2477                                                    gfp_t mask)
2478 {
2479         struct extent_buffer *eb = NULL;
2480
2481         spin_lock(&tree->lru_lock);
2482         eb = find_lru(tree, start, len);
2483         spin_unlock(&tree->lru_lock);
2484         if (eb) {
2485                 return eb;
2486         }
2487
2488         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
             if (!eb)
                     return NULL;
2489         INIT_LIST_HEAD(&eb->lru);
2490         eb->start = start;
2491         eb->len = len;
2492         atomic_set(&eb->refs, 1);
2493
2494         return eb;
2495 }
2496
2497 static void __free_extent_buffer(struct extent_buffer *eb)
2498 {
2499         kmem_cache_free(extent_buffer_cache, eb);
2500 }
2501
2502 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
2503                                           u64 start, unsigned long len,
2504                                           struct page *page0,
2505                                           gfp_t mask)
2506 {
2507         unsigned long num_pages = num_extent_pages(start, len);
2508         unsigned long i;
2509         unsigned long index = start >> PAGE_CACHE_SHIFT;
2510         struct extent_buffer *eb;
2511         struct page *p;
2512         struct address_space *mapping = tree->mapping;
2513         int uptodate = 1;
2514
2515         eb = __alloc_extent_buffer(tree, start, len, mask);
2516         if (!eb || IS_ERR(eb))
2517                 return NULL;
2518
2519         if (eb->flags & EXTENT_BUFFER_FILLED)
2520                 goto lru_add;
2521
2522         if (page0) {
2523                 eb->first_page = page0;
2524                 i = 1;
2525                 index++;
2526                 page_cache_get(page0);
2527                 mark_page_accessed(page0);
2528                 set_page_extent_mapped(page0);
2529                 WARN_ON(!PageUptodate(page0));
2530                 set_page_extent_head(page0, len);
2531         } else {
2532                 i = 0;
2533         }
2534         for (; i < num_pages; i++, index++) {
2535                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2536                 if (!p) {
2537                         WARN_ON(1);
2538                         goto fail;
2539                 }
2540                 set_page_extent_mapped(p);
2541                 mark_page_accessed(p);
2542                 if (i == 0) {
2543                         eb->first_page = p;
2544                         set_page_extent_head(p, len);
2545                 } else {
2546                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2547                 }
2548                 if (!PageUptodate(p))
2549                         uptodate = 0;
2550                 unlock_page(p);
2551         }
2552         if (uptodate)
2553                 eb->flags |= EXTENT_UPTODATE;
2554         eb->flags |= EXTENT_BUFFER_FILLED;
2555
2556 lru_add:
2557         spin_lock(&tree->lru_lock);
2558         add_lru(tree, eb);
2559         spin_unlock(&tree->lru_lock);
2560         return eb;
2561
2562 fail:
2563         spin_lock(&tree->lru_lock);
2564         list_del_init(&eb->lru);
2565         spin_unlock(&tree->lru_lock);
2566         if (!atomic_dec_and_test(&eb->refs))
2567                 return NULL;
2568         for (index = 1; index < i; index++) {
2569                 page_cache_release(extent_buffer_page(eb, index));
2570         }
2571         if (i > 0)
2572                 page_cache_release(extent_buffer_page(eb, 0));
2573         __free_extent_buffer(eb);
2574         return NULL;
2575 }
2576 EXPORT_SYMBOL(alloc_extent_buffer);
2577
2578 struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
2579                                          u64 start, unsigned long len,
2580                                           gfp_t mask)
2581 {
2582         unsigned long num_pages = num_extent_pages(start, len);
2583         unsigned long i;
2584         unsigned long index = start >> PAGE_CACHE_SHIFT;
2585         struct extent_buffer *eb;
2586         struct page *p;
2587         struct address_space *mapping = tree->mapping;
2588         int uptodate = 1;
2589
2590         eb = __alloc_extent_buffer(tree, start, len, mask);
2591         if (!eb || IS_ERR(eb))
2592                 return NULL;
2593
2594         if (eb->flags & EXTENT_BUFFER_FILLED)
2595                 goto lru_add;
2596
2597         for (i = 0; i < num_pages; i++, index++) {
2598                 p = find_lock_page(mapping, index);
2599                 if (!p) {
2600                         goto fail;
2601                 }
2602                 set_page_extent_mapped(p);
2603                 mark_page_accessed(p);
2604
2605                 if (i == 0) {
2606                         eb->first_page = p;
2607                         set_page_extent_head(p, len);
2608                 } else {
2609                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2610                 }
2611
2612                 if (!PageUptodate(p))
2613                         uptodate = 0;
2614                 unlock_page(p);
2615         }
2616         if (uptodate)
2617                 eb->flags |= EXTENT_UPTODATE;
2618         eb->flags |= EXTENT_BUFFER_FILLED;
2619
2620 lru_add:
2621         spin_lock(&tree->lru_lock);
2622         add_lru(tree, eb);
2623         spin_unlock(&tree->lru_lock);
2624         return eb;
2625 fail:
2626         spin_lock(&tree->lru_lock);
2627         list_del_init(&eb->lru);
2628         spin_unlock(&tree->lru_lock);
2629         if (!atomic_dec_and_test(&eb->refs))
2630                 return NULL;
2631         for (index = 1; index < i; index++) {
2632                 page_cache_release(extent_buffer_page(eb, index));
2633         }
2634         if (i > 0)
2635                 page_cache_release(extent_buffer_page(eb, 0));
2636         __free_extent_buffer(eb);
2637         return NULL;
2638 }
2639 EXPORT_SYMBOL(find_extent_buffer);
2640
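/*
 * drop a reference on an extent buffer.  When the last reference goes away
 * the pages are released and the buffer struct is freed.
 */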
2641 void free_extent_buffer(struct extent_buffer *eb)
2642 {
2643         unsigned long i;
2644         unsigned long num_pages;
2645
2646         if (!eb)
2647                 return;
2648
2649         if (!atomic_dec_and_test(&eb->refs))
2650                 return;
2651
2652         WARN_ON(!list_empty(&eb->lru));
2653         num_pages = num_extent_pages(eb->start, eb->len);
2654
2655         for (i = 1; i < num_pages; i++) {
2656                 page_cache_release(extent_buffer_page(eb, i));
2657         }
2658         page_cache_release(extent_buffer_page(eb, 0));
2659         __free_extent_buffer(eb);
2660 }
2661 EXPORT_SYMBOL(free_extent_buffer);
2662
2663 int clear_extent_buffer_dirty(struct extent_map_tree *tree,
2664                               struct extent_buffer *eb)
2665 {
2666         int set;
2667         unsigned long i;
2668         unsigned long num_pages;
2669         struct page *page;
2670
2671         u64 start = eb->start;
2672         u64 end = start + eb->len - 1;
2673
2674         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2675         num_pages = num_extent_pages(eb->start, eb->len);
2676
2677         for (i = 0; i < num_pages; i++) {
2678                 page = extent_buffer_page(eb, i);
2679                 lock_page(page);
2680                 if (i == 0)
2681                         set_page_extent_head(page, eb->len);
2682                 else
2683                         set_page_private(page, EXTENT_PAGE_PRIVATE);
2684
2685                 /*
2686                  * if we're on the last page or the first page and the
2687                  * block isn't aligned on a page boundary, do extra checks
2688                  * to make sure we don't clean page that is partially dirty
2689                  */
2690                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2691                     ((i == num_pages - 1) &&
2692                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2693                         start = (u64)page->index << PAGE_CACHE_SHIFT;
2694                         end  = start + PAGE_CACHE_SIZE - 1;
2695                         if (test_range_bit(tree, start, end,
2696                                            EXTENT_DIRTY, 0)) {
2697                                 unlock_page(page);
2698                                 continue;
2699                         }
2700                 }
2701                 clear_page_dirty_for_io(page);
2702                 write_lock_irq(&page->mapping->tree_lock);
2703                 if (!PageDirty(page)) {
2704                         radix_tree_tag_clear(&page->mapping->page_tree,
2705                                                 page_index(page),
2706                                                 PAGECACHE_TAG_DIRTY);
2707                 }
2708                 write_unlock_irq(&page->mapping->tree_lock);
2709                 unlock_page(page);
2710         }
2711         return 0;
2712 }
2713 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2714
2715 int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
2716                                     struct extent_buffer *eb)
2717 {
2718         return wait_on_extent_writeback(tree, eb->start,
2719                                         eb->start + eb->len - 1);
2720 }
2721 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2722
2723 int set_extent_buffer_dirty(struct extent_map_tree *tree,
2724                              struct extent_buffer *eb)
2725 {
2726         unsigned long i;
2727         unsigned long num_pages;
2728
2729         num_pages = num_extent_pages(eb->start, eb->len);
2730         for (i = 0; i < num_pages; i++) {
2731                 struct page *page = extent_buffer_page(eb, i);
2732                 /* writepage may need to do something special for the
2733                  * first page, we have to make sure page->private is
2734                  * properly set.  releasepage may drop page->private
2735                  * on us if the page isn't already dirty.
2736                  */
2737                 if (i == 0) {
2738                         lock_page(page);
2739                         set_page_extent_head(page, eb->len);
2740                 } else if (PagePrivate(page) &&
2741                            page->private != EXTENT_PAGE_PRIVATE) {
2742                         lock_page(page);
2743                         set_page_extent_mapped(page);
2744                         unlock_page(page);
2745                 }
2746                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2747                 if (i == 0)
2748                         unlock_page(page);
2749         }
2750         return set_extent_dirty(tree, eb->start,
2751                                 eb->start + eb->len - 1, GFP_NOFS);
2752 }
2753 EXPORT_SYMBOL(set_extent_buffer_dirty);
2754
2755 int set_extent_buffer_uptodate(struct extent_map_tree *tree,
2756                                 struct extent_buffer *eb)
2757 {
2758         unsigned long i;
2759         struct page *page;
2760         unsigned long num_pages;
2761
2762         num_pages = num_extent_pages(eb->start, eb->len);
2763
2764         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2765                             GFP_NOFS);
2766         for (i = 0; i < num_pages; i++) {
2767                 page = extent_buffer_page(eb, i);
2768                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2769                     ((i == num_pages - 1) &&
2770                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2771                         check_page_uptodate(tree, page);
2772                         continue;
2773                 }
2774                 SetPageUptodate(page);
2775         }
2776         return 0;
2777 }
2778 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2779
2780 int extent_buffer_uptodate(struct extent_map_tree *tree,
2781                              struct extent_buffer *eb)
2782 {
2783         if (eb->flags & EXTENT_UPTODATE)
2784                 return 1;
2785         return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2786                            EXTENT_UPTODATE, 1);
2787 }
2788 EXPORT_SYMBOL(extent_buffer_uptodate);
2789
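/*
 * read any pages in the extent buffer that aren't uptodate, starting at
 * the page that covers 'start'.  With wait set, this blocks until the
 * reads finish and marks the buffer EXTENT_UPTODATE on success; without
 * it, reads are only kicked off for pages that could be locked without
 * blocking.
 */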
2790 int read_extent_buffer_pages(struct extent_map_tree *tree,
2791                              struct extent_buffer *eb,
2792                              u64 start,
2793                              int wait)
2794 {
2795         unsigned long i;
2796         unsigned long start_i;
2797         struct page *page;
2798         int err;
2799         int ret = 0;
2800         unsigned long num_pages;
2801
2802         if (eb->flags & EXTENT_UPTODATE)
2803                 return 0;
2804
2805         if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2806                            EXTENT_UPTODATE, 1)) {
2807                 return 0;
2808         }
2809
2810         if (start) {
2811                 WARN_ON(start < eb->start);
2812                 start_i = (start >> PAGE_CACHE_SHIFT) -
2813                         (eb->start >> PAGE_CACHE_SHIFT);
2814         } else {
2815                 start_i = 0;
2816         }
2817
2818         num_pages = num_extent_pages(eb->start, eb->len);
2819         for (i = start_i; i < num_pages; i++) {
2820                 page = extent_buffer_page(eb, i);
2821                 if (PageUptodate(page)) {
2822                         continue;
2823                 }
2824                 if (!wait) {
2825                         if (TestSetPageLocked(page)) {
2826                                 continue;
2827                         }
2828                 } else {
2829                         lock_page(page);
2830                 }
2831                 if (!PageUptodate(page)) {
2832                         err = page->mapping->a_ops->readpage(NULL, page);
2833                         if (err) {
2834                                 ret = err;
2835                         }
2836                 } else {
2837                         unlock_page(page);
2838                 }
2839         }
2840
2841         if (ret || !wait) {
2842                 return ret;
2843         }
2844
2845         for (i = start_i; i < num_pages; i++) {
2846                 page = extent_buffer_page(eb, i);
2847                 wait_on_page_locked(page);
2848                 if (!PageUptodate(page)) {
2849                         ret = -EIO;
2850                 }
2851         }
2852         if (!ret)
2853                 eb->flags |= EXTENT_UPTODATE;
2854         return ret;
2855 }
2856 EXPORT_SYMBOL(read_extent_buffer_pages);
2857
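/*
 * copy len bytes starting at offset 'start' in the extent buffer into
 * dstv, kmapping one page at a time.
 */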
2858 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2859                         unsigned long start,
2860                         unsigned long len)
2861 {
2862         size_t cur;
2863         size_t offset;
2864         struct page *page;
2865         char *kaddr;
2866         char *dst = (char *)dstv;
2867         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2868         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2869         unsigned long num_pages = num_extent_pages(eb->start, eb->len);
2870
2871         WARN_ON(start > eb->len);
2872         WARN_ON(start + len > eb->start + eb->len);
2873
2874         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2875
2876         while (len > 0) {
2877                 page = extent_buffer_page(eb, i);
2878                 if (!PageUptodate(page)) {
2879                         printk("page %lu not up to date i %lu, total %lu, "
                                    "len %lu\n", page->index, i, num_pages,
                                    eb->len);
2880                         WARN_ON(1);
2881                 }
2882                 WARN_ON(!PageUptodate(page));
2883
2884                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2885                 kaddr = kmap_atomic(page, KM_USER1);
2886                 memcpy(dst, kaddr + offset, cur);
2887                 kunmap_atomic(kaddr, KM_USER1);
2888
2889                 dst += cur;
2890                 len -= cur;
2891                 offset = 0;
2892                 i++;
2893         }
2894 }
2895 EXPORT_SYMBOL(read_extent_buffer);
2896
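/*
 * kmap the single page that covers [start, start + min_len) in the extent
 * buffer and return pointers into it.  -EINVAL is returned when the
 * requested range would cross a page boundary.
 */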
2897 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
2898                                unsigned long min_len, char **token, char **map,
2899                                unsigned long *map_start,
2900                                unsigned long *map_len, int km)
2901 {
2902         size_t offset = start & (PAGE_CACHE_SIZE - 1);
2903         char *kaddr;
2904         struct page *p;
2905         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2906         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2907         unsigned long end_i = (start_offset + start + min_len - 1) >>
2908                 PAGE_CACHE_SHIFT;
2909
2910         if (i != end_i)
2911                 return -EINVAL;
2912
2913         if (i == 0) {
2914                 offset = start_offset;
2915                 *map_start = 0;
2916         } else {
2917                 offset = 0;
2918                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
2919         }
2920         if (start + min_len > eb->len) {
2921                 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n",
                            eb->start, eb->len, start, min_len);
2922                 WARN_ON(1);
2923         }
2924
2925         p = extent_buffer_page(eb, i);
2926         WARN_ON(!PageUptodate(p));
2927         kaddr = kmap_atomic(p, km);
2928         *token = kaddr;
2929         *map = kaddr + offset;
2930         *map_len = PAGE_CACHE_SIZE - offset;
2931         return 0;
2932 }
2933 EXPORT_SYMBOL(map_private_extent_buffer);
2934
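/*
 * map_extent_buffer - like map_private_extent_buffer, but cooperates with
 * the mapping cached in the extent buffer itself
 *
 * Any mapping already cached in eb->map_token is dropped before the new
 * range is mapped; if one had to be dropped, the fresh mapping is cached
 * in its place on success.
 */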
2935 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
2936                       unsigned long min_len,
2937                       char **token, char **map,
2938                       unsigned long *map_start,
2939                       unsigned long *map_len, int km)
2940 {
2941         int err;
2942         int save = 0;
2943         if (eb->map_token) {
2944                 unmap_extent_buffer(eb, eb->map_token, km);
2945                 eb->map_token = NULL;
2946                 save = 1;
2947         }
2948         err = map_private_extent_buffer(eb, start, min_len, token, map,
2949                                        map_start, map_len, km);
2950         if (!err && save) {
2951                 eb->map_token = *token;
2952                 eb->kaddr = *map;
2953                 eb->map_start = *map_start;
2954                 eb->map_len = *map_len;
2955         }
2956         return err;
2957 }
2958 EXPORT_SYMBOL(map_extent_buffer);
2959
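/*
 * unmap_extent_buffer - release a mapping returned by map_extent_buffer or
 * map_private_extent_buffer (token is the value handed back through *token)
 */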
2960 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
2961 {
2962         kunmap_atomic(token, km);
2963 }
2964 EXPORT_SYMBOL(unmap_extent_buffer);
2965
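/*
 * memcmp_extent_buffer - memcmp len bytes at ptrv against the buffer
 * contents starting at byte offset start; returns the result of the first
 * page-sized chunk that differs, or 0 if the ranges are equal
 */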
2966 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
2967                           unsigned long start,
2968                           unsigned long len)
2969 {
2970         size_t cur;
2971         size_t offset;
2972         struct page *page;
2973         char *kaddr;
2974         char *ptr = (char *)ptrv;
2975         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2976         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2977         int ret = 0;
2978
2979         WARN_ON(start > eb->len);
2980         WARN_ON(start + len > eb->start + eb->len);
2981
2982         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2983
2984         while (len > 0) {
2985                 page = extent_buffer_page(eb, i);
2986                 WARN_ON(!PageUptodate(page));
2987
2988                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2989
2990                 kaddr = kmap_atomic(page, KM_USER0);
2991                 ret = memcmp(ptr, kaddr + offset, cur);
2992                 kunmap_atomic(kaddr, KM_USER0);
2993                 if (ret)
2994                         break;
2995
2996                 ptr += cur;
2997                 len -= cur;
2998                 offset = 0;
2999                 i++;
3000         }
3001         return ret;
3002 }
3003 EXPORT_SYMBOL(memcmp_extent_buffer);
3004
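/*
 * write_extent_buffer - copy len bytes from srcv into the buffer at byte
 * offset start, page by page (note the KM_USER1 kmap slot, which lets the
 * copy nest inside a KM_USER0 mapping held by the caller)
 */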
3005 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3006                          unsigned long start, unsigned long len)
3007 {
3008         size_t cur;
3009         size_t offset;
3010         struct page *page;
3011         char *kaddr;
3012         char *src = (char *)srcv;
3013         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3014         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3015
3016         WARN_ON(start > eb->len);
3017         WARN_ON(start + len > eb->start + eb->len);
3018
3019         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3020
3021         while (len > 0) {
3022                 page = extent_buffer_page(eb, i);
3023                 WARN_ON(!PageUptodate(page));
3024
3025                 cur = min(len, PAGE_CACHE_SIZE - offset);
3026                 kaddr = kmap_atomic(page, KM_USER1);
3027                 memcpy(kaddr + offset, src, cur);
3028                 kunmap_atomic(kaddr, KM_USER1);
3029
3030                 src += cur;
3031                 len -= cur;
3032                 offset = 0;
3033                 i++;
3034         }
3035 }
3036 EXPORT_SYMBOL(write_extent_buffer);
3037
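/*
 * memset_extent_buffer - fill len bytes of the buffer, starting at byte
 * offset start, with the byte value c
 */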
3038 void memset_extent_buffer(struct extent_buffer *eb, char c,
3039                           unsigned long start, unsigned long len)
3040 {
3041         size_t cur;
3042         size_t offset;
3043         struct page *page;
3044         char *kaddr;
3045         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3046         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3047
3048         WARN_ON(start > eb->len);
3049         WARN_ON(start + len > eb->start + eb->len);
3050
3051         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3052
3053         while (len > 0) {
3054                 page = extent_buffer_page(eb, i);
3055                 WARN_ON(!PageUptodate(page));
3056
3057                 cur = min(len, PAGE_CACHE_SIZE - offset);
3058                 kaddr = kmap_atomic(page, KM_USER0);
3059                 memset(kaddr + offset, c, cur);
3060                 kunmap_atomic(kaddr, KM_USER0);
3061
3062                 len -= cur;
3063                 offset = 0;
3064                 i++;
3065         }
3066 }
3067 EXPORT_SYMBOL(memset_extent_buffer);
3068
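/*
 * copy_extent_buffer - copy len bytes from src at src_offset into dst at
 * dst_offset; the two buffers are expected to be the same length
 */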
3069 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3070                         unsigned long dst_offset, unsigned long src_offset,
3071                         unsigned long len)
3072 {
3073         u64 dst_len = dst->len;
3074         size_t cur;
3075         size_t offset;
3076         struct page *page;
3077         char *kaddr;
3078         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3079         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3080
3081         WARN_ON(src->len != dst_len);
3082
3083         offset = (start_offset + dst_offset) &
3084                 ((unsigned long)PAGE_CACHE_SIZE - 1);
3085
3086         while (len > 0) {
3087                 page = extent_buffer_page(dst, i);
3088                 WARN_ON(!PageUptodate(page));
3089
3090                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3091
3092                 kaddr = kmap_atomic(page, KM_USER0);
3093                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3094                 kunmap_atomic(kaddr, KM_USER0);
3095
3096                 src_offset += cur;
3097                 len -= cur;
3098                 offset = 0;
3099                 i++;
3100         }
3101 }
3102 EXPORT_SYMBOL(copy_extent_buffer);
3103
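/*
 * move_pages - memmove-style helper for a single page-bounded chunk.
 * When source and destination are the same page a plain memmove is used;
 * otherwise the bytes are copied back to front, matching the tail-first
 * order memmove_extent_buffer walks in.
 */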
3104 static void move_pages(struct page *dst_page, struct page *src_page,
3105                        unsigned long dst_off, unsigned long src_off,
3106                        unsigned long len)
3107 {
3108         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3109         if (dst_page == src_page) {
3110                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3111         } else {
3112                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3113                 char *p = dst_kaddr + dst_off + len;
3114                 char *s = src_kaddr + src_off + len;
3115
3116                 while (len--)
3117                         *--p = *--s;
3118
3119                 kunmap_atomic(src_kaddr, KM_USER1);
3120         }
3121         kunmap_atomic(dst_kaddr, KM_USER0);
3122 }
3123
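/*
 * copy_pages - memcpy-style helper for a single page-bounded chunk; the
 * source page is only mapped separately when it differs from the
 * destination page
 */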
3124 static void copy_pages(struct page *dst_page, struct page *src_page,
3125                        unsigned long dst_off, unsigned long src_off,
3126                        unsigned long len)
3127 {
3128         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3129         char *src_kaddr;
3130
3131         if (dst_page != src_page)
3132                 src_kaddr = kmap_atomic(src_page, KM_USER1);
3133         else
3134                 src_kaddr = dst_kaddr;
3135
3136         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3137         kunmap_atomic(dst_kaddr, KM_USER0);
3138         if (dst_page != src_page)
3139                 kunmap_atomic(src_kaddr, KM_USER1);
3140 }
3141
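/*
 * memcpy_extent_buffer - copy len bytes inside one buffer, from src_offset
 * to dst_offset, front to back in page-bounded chunks.  Like memcpy it is
 * not overlap-safe when dst_offset lands inside the source range; use
 * memmove_extent_buffer for that case.
 */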
3142 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3143                            unsigned long src_offset, unsigned long len)
3144 {
3145         size_t cur;
3146         size_t dst_off_in_page;
3147         size_t src_off_in_page;
3148         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3149         unsigned long dst_i;
3150         unsigned long src_i;
3151
3152         if (src_offset + len > dst->len) {
3153                 printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
3154                        src_offset, len, dst->len);
3155                 BUG_ON(1);
3156         }
3157         if (dst_offset + len > dst->len) {
3158                 printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
3159                        dst_offset, len, dst->len);
3160                 BUG_ON(1);
3161         }
3162
3163         while (len > 0) {
3164                 dst_off_in_page = (start_offset + dst_offset) &
3165                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3166                 src_off_in_page = (start_offset + src_offset) &
3167                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3168
3169                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3170                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3171
3172                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3173                                                src_off_in_page));
3174                 cur = min_t(unsigned long, cur,
3175                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3176
3177                 copy_pages(extent_buffer_page(dst, dst_i),
3178                            extent_buffer_page(dst, src_i),
3179                            dst_off_in_page, src_off_in_page, cur);
3180
3181                 src_offset += cur;
3182                 dst_offset += cur;
3183                 len -= cur;
3184         }
3185 }
3186 EXPORT_SYMBOL(memcpy_extent_buffer);
3187
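/*
 * memmove_extent_buffer - overlap-safe move of len bytes inside one buffer.
 * When the destination sits before the source a forward copy via
 * memcpy_extent_buffer is enough; otherwise the chunks are copied back to
 * front through move_pages.
 */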
3188 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3189                            unsigned long src_offset, unsigned long len)
3190 {
3191         size_t cur;
3192         size_t dst_off_in_page;
3193         size_t src_off_in_page;
3194         unsigned long dst_end = dst_offset + len - 1;
3195         unsigned long src_end = src_offset + len - 1;
3196         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3197         unsigned long dst_i;
3198         unsigned long src_i;
3199
3200         if (src_offset + len > dst->len) {
3201                 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
3202                        src_offset, len, dst->len);
3203                 BUG_ON(1);
3204         }
3205         if (dst_offset + len > dst->len) {
3206                 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
3207                        dst_offset, len, dst->len);
3208                 BUG_ON(1);
3209         }
3210         if (dst_offset < src_offset) {
3211                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3212                 return;
3213         }
3214         while (len > 0) {
3215                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3216                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3217
3218                 dst_off_in_page = (start_offset + dst_end) &
3219                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3220                 src_off_in_page = (start_offset + src_end) &
3221                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3222
3223                 cur = min_t(unsigned long, len, src_off_in_page + 1);
3224                 cur = min(cur, dst_off_in_page + 1);
3225                 move_pages(extent_buffer_page(dst, dst_i),
3226                            extent_buffer_page(dst, src_i),
3227                            dst_off_in_page - cur + 1,
3228                            src_off_in_page - cur + 1, cur);
3229
3230                 dst_end -= cur;
3231                 src_end -= cur;
3232                 len -= cur;
3233         }
3234 }
3235 EXPORT_SYMBOL(memmove_extent_buffer);