btrfs: 32-bit type problems
fs/btrfs/extent_map.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include "extent_map.h"
13
14 /* temporary define until extent_map moves out of btrfs */
15 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
16                                        unsigned long extra_flags,
17                                        void (*ctor)(void *, struct kmem_cache *,
18                                                     unsigned long));
19
20 static struct kmem_cache *extent_map_cache;
21 static struct kmem_cache *extent_state_cache;
22 static struct kmem_cache *extent_buffer_cache;
23
24 static LIST_HEAD(buffers);
25 static LIST_HEAD(states);
26
27 static DEFINE_SPINLOCK(state_lock);
28 #define BUFFER_LRU_MAX 64
29
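/*
 * tree_entry mirrors the fields that extent_map and extent_state keep at
 * the start of their structs, so the rb tree helpers below can operate on
 * either type through the same [start, end] key.
 */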
30 struct tree_entry {
31         u64 start;
32         u64 end;
33         int in_tree;
34         struct rb_node rb_node;
35 };
36
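/* called once at startup to create the slab caches used by this file */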
37 void __init extent_map_init(void)
38 {
39         extent_map_cache = btrfs_cache_create("extent_map",
40                                             sizeof(struct extent_map), 0,
41                                             NULL);
42         extent_state_cache = btrfs_cache_create("extent_state",
43                                             sizeof(struct extent_state), 0,
44                                             NULL);
45         extent_buffer_cache = btrfs_cache_create("extent_buffers",
46                                             sizeof(struct extent_buffer), 0,
47                                             NULL);
48 }
49
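/* tear down the slab caches, reporting any extent_state objects that leaked */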
50 void __exit extent_map_exit(void)
51 {
52         struct extent_state *state;
53
54         while (!list_empty(&states)) {
55                 state = list_entry(states.next, struct extent_state, list);
56                 printk("state leak: start %llu end %llu state %lu in tree %d refs %d\n",
                          (unsigned long long)state->start, (unsigned long long)state->end,
                          state->state, state->in_tree, atomic_read(&state->refs));
57                 list_del(&state->list);
58                 kmem_cache_free(extent_state_cache, state);
59
60         }
61
62         if (extent_map_cache)
63                 kmem_cache_destroy(extent_map_cache);
64         if (extent_state_cache)
65                 kmem_cache_destroy(extent_state_cache);
66         if (extent_buffer_cache)
67                 kmem_cache_destroy(extent_buffer_cache);
68 }
69
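/* initialize an empty extent map tree for the given address space */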
70 void extent_map_tree_init(struct extent_map_tree *tree,
71                           struct address_space *mapping, gfp_t mask)
72 {
73         tree->map.rb_node = NULL;
74         tree->state.rb_node = NULL;
75         tree->ops = NULL;
76         rwlock_init(&tree->lock);
77         spin_lock_init(&tree->lru_lock);
78         tree->mapping = mapping;
79         INIT_LIST_HEAD(&tree->buffer_lru);
80         tree->lru_size = 0;
81 }
82 EXPORT_SYMBOL(extent_map_tree_init);
83
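/* drop every extent_buffer held on the tree's LRU list */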
84 void extent_map_tree_empty_lru(struct extent_map_tree *tree)
85 {
86         struct extent_buffer *eb;
87         while(!list_empty(&tree->buffer_lru)) {
88                 eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
89                                 lru);
90                 list_del(&eb->lru);
91                 free_extent_buffer(eb);
92         }
93 }
94 EXPORT_SYMBOL(extent_map_tree_empty_lru);
95
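/* allocate a new extent_map with a single reference held by the caller */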
96 struct extent_map *alloc_extent_map(gfp_t mask)
97 {
98         struct extent_map *em;
99         em = kmem_cache_alloc(extent_map_cache, mask);
100         if (!em || IS_ERR(em))
101                 return em;
102         em->in_tree = 0;
103         atomic_set(&em->refs, 1);
104         return em;
105 }
106 EXPORT_SYMBOL(alloc_extent_map);
107
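/* drop a reference on an extent_map, freeing it when the count hits zero */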
108 void free_extent_map(struct extent_map *em)
109 {
110         if (!em)
111                 return;
112         if (atomic_dec_and_test(&em->refs)) {
113                 WARN_ON(em->in_tree);
114                 kmem_cache_free(extent_map_cache, em);
115         }
116 }
117 EXPORT_SYMBOL(free_extent_map);
118
119
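/*
 * allocate a new extent_state with one reference; the object is also added
 * to the global 'states' list so leaks can be reported at exit time
 */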
120 struct extent_state *alloc_extent_state(gfp_t mask)
121 {
122         struct extent_state *state;
123         unsigned long flags;
124
125         state = kmem_cache_alloc(extent_state_cache, mask);
126         if (!state || IS_ERR(state))
127                 return state;
128         state->state = 0;
129         state->in_tree = 0;
130         state->private = 0;
131
132         spin_lock_irqsave(&state_lock, flags);
133         list_add(&state->list, &states);
134         spin_unlock_irqrestore(&state_lock, flags);
135
136         atomic_set(&state->refs, 1);
137         init_waitqueue_head(&state->wq);
138         return state;
139 }
140 EXPORT_SYMBOL(alloc_extent_state);
141
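/* drop a reference on an extent_state, freeing it when the count hits zero */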
142 void free_extent_state(struct extent_state *state)
143 {
144         unsigned long flags;
145         if (!state)
146                 return;
147         if (atomic_dec_and_test(&state->refs)) {
148                 WARN_ON(state->in_tree);
149                 spin_lock_irqsave(&state_lock, flags);
150                 list_del(&state->list);
151                 spin_unlock_irqrestore(&state_lock, flags);
152                 kmem_cache_free(extent_state_cache, state);
153         }
154 }
155 EXPORT_SYMBOL(free_extent_state);
156
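/*
 * insert 'node' into an rb tree keyed by [start, end] byte offsets.
 * Returns NULL on success, or the existing node whose range already
 * covers 'offset' if there is a collision.
 */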
157 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
158                                    struct rb_node *node)
159 {
160         struct rb_node ** p = &root->rb_node;
161         struct rb_node * parent = NULL;
162         struct tree_entry *entry;
163
164         while(*p) {
165                 parent = *p;
166                 entry = rb_entry(parent, struct tree_entry, rb_node);
167
168                 if (offset < entry->start)
169                         p = &(*p)->rb_left;
170                 else if (offset > entry->end)
171                         p = &(*p)->rb_right;
172                 else
173                         return parent;
174         }
175
176         entry = rb_entry(node, struct tree_entry, rb_node);
177         entry->in_tree = 1;
178         rb_link_node(node, parent, p);
179         rb_insert_color(node, root);
180         return NULL;
181 }
182
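/*
 * search for the entry containing 'offset'.  If no entry contains it and
 * prev_ret is non-NULL, *prev_ret is set to the first entry that ends at
 * or after 'offset' (or NULL if the tree ends before it).
 */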
183 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
184                                    struct rb_node **prev_ret)
185 {
186         struct rb_node * n = root->rb_node;
187         struct rb_node *prev = NULL;
188         struct tree_entry *entry;
189         struct tree_entry *prev_entry = NULL;
190
191         while(n) {
192                 entry = rb_entry(n, struct tree_entry, rb_node);
193                 prev = n;
194                 prev_entry = entry;
195
196                 if (offset < entry->start)
197                         n = n->rb_left;
198                 else if (offset > entry->end)
199                         n = n->rb_right;
200                 else
201                         return n;
202         }
203         if (!prev_ret)
204                 return NULL;
205         while(prev && offset > prev_entry->end) {
206                 prev = rb_next(prev);
207                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
208         }
209         *prev_ret = prev;
210         return NULL;
211 }
212
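/* like __tree_search, but falls back to the next entry when there is no exact hit */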
213 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
214 {
215         struct rb_node *prev;
216         struct rb_node *ret;
217         ret = __tree_search(root, offset, &prev);
218         if (!ret)
219                 return prev;
220         return ret;
221 }
222
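/* remove the entry containing 'offset' from the tree, if any */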
223 static int tree_delete(struct rb_root *root, u64 offset)
224 {
225         struct rb_node *node;
226         struct tree_entry *entry;
227
228         node = __tree_search(root, offset, NULL);
229         if (!node)
230                 return -ENOENT;
231         entry = rb_entry(node, struct tree_entry, rb_node);
232         entry->in_tree = 0;
233         rb_erase(node, root);
234         return 0;
235 }
236
237 /*
238  * add_extent_mapping tries a simple backward merge with existing
239  * mappings.  The extent_map struct passed in will be inserted into
240  * the tree directly (no copies made, just a reference taken).
241  */
242 int add_extent_mapping(struct extent_map_tree *tree,
243                        struct extent_map *em)
244 {
245         int ret = 0;
246         struct extent_map *prev = NULL;
247         struct rb_node *rb;
248
249         write_lock_irq(&tree->lock);
250         rb = tree_insert(&tree->map, em->end, &em->rb_node);
251         if (rb) {
252                 prev = rb_entry(rb, struct extent_map, rb_node);
253                 printk("found extent map %llu %llu on insert of %llu %llu\n",
                           (unsigned long long)prev->start, (unsigned long long)prev->end,
                           (unsigned long long)em->start, (unsigned long long)em->end);
254                 ret = -EEXIST;
255                 goto out;
256         }
257         atomic_inc(&em->refs);
258         if (em->start != 0) {
259                 rb = rb_prev(&em->rb_node);
260                 if (rb)
261                         prev = rb_entry(rb, struct extent_map, rb_node);
262                 if (prev && prev->end + 1 == em->start &&
263                     ((em->block_start == EXTENT_MAP_HOLE &&
264                       prev->block_start == EXTENT_MAP_HOLE) ||
265                              (em->block_start == prev->block_end + 1))) {
266                         em->start = prev->start;
267                         em->block_start = prev->block_start;
268                         rb_erase(&prev->rb_node, &tree->map);
269                         prev->in_tree = 0;
270                         free_extent_map(prev);
271                 }
272         }
273 out:
274         write_unlock_irq(&tree->lock);
275         return ret;
276 }
277 EXPORT_SYMBOL(add_extent_mapping);
278
279 /*
280  * lookup_extent_mapping returns the first extent_map struct in the
281  * tree that intersects the [start, end] (inclusive) range.  There may
282  * be additional objects in the tree that intersect, so check the object
283  * returned carefully to make sure you don't need additional lookups.
284  */
285 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
286                                          u64 start, u64 end)
287 {
288         struct extent_map *em;
289         struct rb_node *rb_node;
290
291         read_lock_irq(&tree->lock);
292         rb_node = tree_search(&tree->map, start);
293         if (!rb_node) {
294                 em = NULL;
295                 goto out;
296         }
297         if (IS_ERR(rb_node)) {
298                 em = ERR_PTR(PTR_ERR(rb_node));
299                 goto out;
300         }
301         em = rb_entry(rb_node, struct extent_map, rb_node);
302         if (em->end < start || em->start > end) {
303                 em = NULL;
304                 goto out;
305         }
306         atomic_inc(&em->refs);
307 out:
308         read_unlock_irq(&tree->lock);
309         return em;
310 }
311 EXPORT_SYMBOL(lookup_extent_mapping);
312
313 /*
314  * removes an extent_map struct from the tree.  No reference counts are
315  * dropped, and no checks are done to see if the range is in use
316  */
317 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
318 {
319         int ret;
320
321         write_lock_irq(&tree->lock);
322         ret = tree_delete(&tree->map, em->end);
323         write_unlock_irq(&tree->lock);
324         return ret;
325 }
326 EXPORT_SYMBOL(remove_extent_mapping);
327
328 /*
329  * utility function to look for merge candidates inside a given range.
330  * Any extents with matching state are merged together into a single
331  * extent in the tree.  Extents with EXTENT_IO in their state field
332  * are not merged because the end_io handlers need to be able to do
333  * operations on them without sleeping (or doing allocations/splits).
334  *
335  * This should be called with the tree lock held.
336  */
337 static int merge_state(struct extent_map_tree *tree,
338                        struct extent_state *state)
339 {
340         struct extent_state *other;
341         struct rb_node *other_node;
342
343         if (state->state & EXTENT_IOBITS)
344                 return 0;
345
346         other_node = rb_prev(&state->rb_node);
347         if (other_node) {
348                 other = rb_entry(other_node, struct extent_state, rb_node);
349                 if (other->end == state->start - 1 &&
350                     other->state == state->state) {
351                         state->start = other->start;
352                         other->in_tree = 0;
353                         rb_erase(&other->rb_node, &tree->state);
354                         free_extent_state(other);
355                 }
356         }
357         other_node = rb_next(&state->rb_node);
358         if (other_node) {
359                 other = rb_entry(other_node, struct extent_state, rb_node);
360                 if (other->start == state->end + 1 &&
361                     other->state == state->state) {
362                         other->start = state->start;
363                         state->in_tree = 0;
364                         rb_erase(&state->rb_node, &tree->state);
365                         free_extent_state(state);
366                 }
367         }
368         return 0;
369 }
370
371 /*
372  * insert an extent_state struct into the tree.  'bits' are set on the
373  * struct before it is inserted.
374  *
375  * This may return -EEXIST if the extent is already there, in which case the
376  * state struct is freed.
377  *
378  * The tree lock is not taken internally.  This is a utility function and
379  * probably isn't what you want to call (see set/clear_extent_bit).
380  */
381 static int insert_state(struct extent_map_tree *tree,
382                         struct extent_state *state, u64 start, u64 end,
383                         int bits)
384 {
385         struct rb_node *node;
386
387         if (end < start) {
388                 printk("end < start %llu %llu\n",
                           (unsigned long long)end, (unsigned long long)start);
389                 WARN_ON(1);
390         }
391         state->state |= bits;
392         state->start = start;
393         state->end = end;
394         node = tree_insert(&tree->state, end, &state->rb_node);
395         if (node) {
396                 struct extent_state *found;
397                 found = rb_entry(node, struct extent_state, rb_node);
398                 printk("found node %llu %llu on insert of %llu %llu\n",
                           (unsigned long long)found->start, (unsigned long long)found->end,
                           (unsigned long long)start, (unsigned long long)end);
399                 free_extent_state(state);
400                 return -EEXIST;
401         }
402         merge_state(tree, state);
403         return 0;
404 }
405
406 /*
407  * split a given extent state struct in two, inserting the preallocated
408  * struct 'prealloc' as the newly created second half.  'split' indicates an
409  * offset inside 'orig' where it should be split.
410  *
411  * Before calling,
412  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
413  * are two extent state structs in the tree:
414  * prealloc: [orig->start, split - 1]
415  * orig: [ split, orig->end ]
416  *
417  * The tree locks are not taken by this function. They need to be held
418  * by the caller.
419  */
420 static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
421                        struct extent_state *prealloc, u64 split)
422 {
423         struct rb_node *node;
424         prealloc->start = orig->start;
425         prealloc->end = split - 1;
426         prealloc->state = orig->state;
427         orig->start = split;
428
429         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
430         if (node) {
431                 struct extent_state *found;
432                 found = rb_entry(node, struct extent_state, rb_node);
433                 printk("found node %llu %llu on insert of %llu %llu\n",
                           (unsigned long long)found->start, (unsigned long long)found->end,
                           (unsigned long long)prealloc->start,
                           (unsigned long long)prealloc->end);
434                 free_extent_state(prealloc);
435                 return -EEXIST;
436         }
437         return 0;
438 }
439
440 /*
441  * utility function to clear some bits in an extent state struct.
442  * it will optionally wake up anyone waiting on this state (wake == 1), or
443  * forcibly remove the state from the tree (delete == 1).
444  *
445  * If no bits are set on the state struct after clearing things, the
446  * struct is freed and removed from the tree
447  */
448 static int clear_state_bit(struct extent_map_tree *tree,
449                             struct extent_state *state, int bits, int wake,
450                             int delete)
451 {
452         int ret = state->state & bits;
453         state->state &= ~bits;
454         if (wake)
455                 wake_up(&state->wq);
456         if (delete || state->state == 0) {
457                 if (state->in_tree) {
458                         rb_erase(&state->rb_node, &tree->state);
459                         state->in_tree = 0;
460                         free_extent_state(state);
461                 } else {
462                         WARN_ON(1);
463                 }
464         } else {
465                 merge_state(tree, state);
466         }
467         return ret;
468 }
469
470 /*
471  * clear some bits on a range in the tree.  This may require splitting
472  * or inserting elements in the tree, so the gfp mask is used to
473  * indicate which allocations or sleeping are allowed.
474  *
475  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
476  * the given range from the tree regardless of state (ie for truncate).
477  *
478  * the range [start, end] is inclusive.
479  *
480  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
481  * bits were already set, or zero if none of the bits were already set.
482  */
483 int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
484                      int bits, int wake, int delete, gfp_t mask)
485 {
486         struct extent_state *state;
487         struct extent_state *prealloc = NULL;
488         struct rb_node *node;
489         unsigned long flags;
490         int err;
491         int set = 0;
492
493 again:
494         if (!prealloc && (mask & __GFP_WAIT)) {
495                 prealloc = alloc_extent_state(mask);
496                 if (!prealloc)
497                         return -ENOMEM;
498         }
499
500         write_lock_irqsave(&tree->lock, flags);
501         /*
502          * this search will find the extents that end after
503          * our range starts
504          */
505         node = tree_search(&tree->state, start);
506         if (!node)
507                 goto out;
508         state = rb_entry(node, struct extent_state, rb_node);
509         if (state->start > end)
510                 goto out;
511         WARN_ON(state->end < start);
512
513         /*
514          *     | ---- desired range ---- |
515          *  | state | or
516          *  | ------------- state -------------- |
517          *
518          * We need to split the extent we found, and may flip
519          * bits on second half.
520          *
521          * If the extent we found extends past our range, we
522          * just split and search again.  It'll get split again
523          * the next time though.
524          *
525          * If the extent we found is inside our range, we clear
526          * the desired bit on it.
527          */
528
529         if (state->start < start) {
530                 err = split_state(tree, state, prealloc, start);
531                 BUG_ON(err == -EEXIST);
532                 prealloc = NULL;
533                 if (err)
534                         goto out;
535                 if (state->end <= end) {
536                         start = state->end + 1;
537                         set |= clear_state_bit(tree, state, bits,
538                                         wake, delete);
539                 } else {
540                         start = state->start;
541                 }
542                 goto search_again;
543         }
544         /*
545          * | ---- desired range ---- |
546          *                        | state |
547          * We need to split the extent, and clear the bit
548          * on the first half
549          */
550         if (state->start <= end && state->end > end) {
551                 err = split_state(tree, state, prealloc, end + 1);
552                 BUG_ON(err == -EEXIST);
553
554                 if (wake)
555                         wake_up(&state->wq);
556                 set |= clear_state_bit(tree, prealloc, bits,
557                                        wake, delete);
558                 prealloc = NULL;
559                 goto out;
560         }
561
562         start = state->end + 1;
563         set |= clear_state_bit(tree, state, bits, wake, delete);
564         goto search_again;
565
566 out:
567         write_unlock_irqrestore(&tree->lock, flags);
568         if (prealloc)
569                 free_extent_state(prealloc);
570
571         return set;
572
573 search_again:
574         if (start > end)
575                 goto out;
576         write_unlock_irqrestore(&tree->lock, flags);
577         if (mask & __GFP_WAIT)
578                 cond_resched();
579         goto again;
580 }
581 EXPORT_SYMBOL(clear_extent_bit);
582
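/*
 * drop the tree lock and sleep until someone clears a bit on 'state' and
 * wakes its waitqueue; the lock is retaken before returning
 */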
583 static int wait_on_state(struct extent_map_tree *tree,
584                          struct extent_state *state)
585 {
586         DEFINE_WAIT(wait);
587         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
588         read_unlock_irq(&tree->lock);
589         schedule();
590         read_lock_irq(&tree->lock);
591         finish_wait(&state->wq, &wait);
592         return 0;
593 }
594
595 /*
596  * waits for one or more bits to clear on a range in the state tree.
597  * The range [start, end] is inclusive.
598  * The tree lock is taken by this function
599  */
600 int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
601 {
602         struct extent_state *state;
603         struct rb_node *node;
604
605         read_lock_irq(&tree->lock);
606 again:
607         while (1) {
608                 /*
609                  * this search will find all the extents that end after
610                  * our range starts
611                  */
612                 node = tree_search(&tree->state, start);
613                 if (!node)
614                         break;
615
616                 state = rb_entry(node, struct extent_state, rb_node);
617
618                 if (state->start > end)
619                         goto out;
620
621                 if (state->state & bits) {
622                         start = state->start;
623                         atomic_inc(&state->refs);
624                         wait_on_state(tree, state);
625                         free_extent_state(state);
626                         goto again;
627                 }
628                 start = state->end + 1;
629
630                 if (start > end)
631                         break;
632
633                 if (need_resched()) {
634                         read_unlock_irq(&tree->lock);
635                         cond_resched();
636                         read_lock_irq(&tree->lock);
637                 }
638         }
639 out:
640         read_unlock_irq(&tree->lock);
641         return 0;
642 }
643 EXPORT_SYMBOL(wait_extent_bit);
644
645 /*
646  * set some bits on a range in the tree.  This may require allocations
647  * or sleeping, so the gfp mask is used to indicate what is allowed.
648  *
649  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
650  * range already has the desired bits set.  The start of the existing
651  * range is returned in failed_start in this case.
652  *
653  * [start, end] is inclusive
654  * This takes the tree lock.
655  */
656 int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
657                    int exclusive, u64 *failed_start, gfp_t mask)
658 {
659         struct extent_state *state;
660         struct extent_state *prealloc = NULL;
661         struct rb_node *node;
662         unsigned long flags;
663         int err = 0;
664         int set;
665         u64 last_start;
666         u64 last_end;
667 again:
668         if (!prealloc && (mask & __GFP_WAIT)) {
669                 prealloc = alloc_extent_state(mask);
670                 if (!prealloc)
671                         return -ENOMEM;
672         }
673
674         write_lock_irqsave(&tree->lock, flags);
675         /*
676          * this search will find all the extents that end after
677          * our range starts.
678          */
679         node = tree_search(&tree->state, start);
680         if (!node) {
681                 err = insert_state(tree, prealloc, start, end, bits);
682                 prealloc = NULL;
683                 BUG_ON(err == -EEXIST);
684                 goto out;
685         }
686
687         state = rb_entry(node, struct extent_state, rb_node);
688         last_start = state->start;
689         last_end = state->end;
690
691         /*
692          * | ---- desired range ---- |
693          * | state |
694          *
695          * Just lock what we found and keep going
696          */
697         if (state->start == start && state->end <= end) {
698                 set = state->state & bits;
699                 if (set && exclusive) {
700                         *failed_start = state->start;
701                         err = -EEXIST;
702                         goto out;
703                 }
704                 state->state |= bits;
705                 start = state->end + 1;
706                 merge_state(tree, state);
707                 goto search_again;
708         }
709
710         /*
711          *     | ---- desired range ---- |
712          * | state |
713          *   or
714          * | ------------- state -------------- |
715          *
716          * We need to split the extent we found, and may flip bits on
717          * second half.
718          *
719          * If the extent we found extends past our
720          * range, we just split and search again.  It'll get split
721          * again the next time though.
722          *
723          * If the extent we found is inside our range, we set the
724          * desired bit on it.
725          */
726         if (state->start < start) {
727                 set = state->state & bits;
728                 if (exclusive && set) {
729                         *failed_start = start;
730                         err = -EEXIST;
731                         goto out;
732                 }
733                 err = split_state(tree, state, prealloc, start);
734                 BUG_ON(err == -EEXIST);
735                 prealloc = NULL;
736                 if (err)
737                         goto out;
738                 if (state->end <= end) {
739                         state->state |= bits;
740                         start = state->end + 1;
741                         merge_state(tree, state);
742                 } else {
743                         start = state->start;
744                 }
745                 goto search_again;
746         }
747         /*
748          * | ---- desired range ---- |
749          *     | state | or               | state |
750          *
751          * There's a hole, we need to insert something in it and
752          * ignore the extent we found.
753          */
754         if (state->start > start) {
755                 u64 this_end;
756                 if (end < last_start)
757                         this_end = end;
758                 else
759                         this_end = last_start - 1;
760                 err = insert_state(tree, prealloc, start, this_end,
761                                    bits);
762                 prealloc = NULL;
763                 BUG_ON(err == -EEXIST);
764                 if (err)
765                         goto out;
766                 start = this_end + 1;
767                 goto search_again;
768         }
769         /*
770          * | ---- desired range ---- |
771          *                        | state |
772          * We need to split the extent, and set the bit
773          * on the first half
774          */
775         if (state->start <= end && state->end > end) {
776                 set = state->state & bits;
777                 if (exclusive && set) {
778                         *failed_start = start;
779                         err = -EEXIST;
780                         goto out;
781                 }
782                 err = split_state(tree, state, prealloc, end + 1);
783                 BUG_ON(err == -EEXIST);
784
785                 prealloc->state |= bits;
786                 merge_state(tree, prealloc);
787                 prealloc = NULL;
788                 goto out;
789         }
790
791         goto search_again;
792
793 out:
794         write_unlock_irqrestore(&tree->lock, flags);
795         if (prealloc)
796                 free_extent_state(prealloc);
797
798         return err;
799
800 search_again:
801         if (start > end)
802                 goto out;
803         write_unlock_irqrestore(&tree->lock, flags);
804         if (mask & __GFP_WAIT)
805                 cond_resched();
806         goto again;
807 }
808 EXPORT_SYMBOL(set_extent_bit);
809
810 /* wrappers around set/clear extent bit */
811 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
812                      gfp_t mask)
813 {
814         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
815                               mask);
816 }
817 EXPORT_SYMBOL(set_extent_dirty);
818
819 int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
820                     int bits, gfp_t mask)
821 {
822         return set_extent_bit(tree, start, end, bits, 0, NULL,
823                               mask);
824 }
825 EXPORT_SYMBOL(set_extent_bits);
826
827 int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
828                       int bits, gfp_t mask)
829 {
830         return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
831 }
832 EXPORT_SYMBOL(clear_extent_bits);
833
834 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
835                      gfp_t mask)
836 {
837         return set_extent_bit(tree, start, end,
838                               EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
839                               mask);
840 }
841 EXPORT_SYMBOL(set_extent_delalloc);
842
843 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
844                        gfp_t mask)
845 {
846         return clear_extent_bit(tree, start, end,
847                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
848 }
849 EXPORT_SYMBOL(clear_extent_dirty);
850
851 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
852                      gfp_t mask)
853 {
854         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
855                               mask);
856 }
857 EXPORT_SYMBOL(set_extent_new);
858
859 int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
860                        gfp_t mask)
861 {
862         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
863 }
864 EXPORT_SYMBOL(clear_extent_new);
865
866 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
867                         gfp_t mask)
868 {
869         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
870                               mask);
871 }
872 EXPORT_SYMBOL(set_extent_uptodate);
873
874 int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
875                           gfp_t mask)
876 {
877         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
878 }
879 EXPORT_SYMBOL(clear_extent_uptodate);
880
881 int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
882                          gfp_t mask)
883 {
884         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
885                               0, NULL, mask);
886 }
887 EXPORT_SYMBOL(set_extent_writeback);
888
889 int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
890                            gfp_t mask)
891 {
892         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
893 }
894 EXPORT_SYMBOL(clear_extent_writeback);
895
896 int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
897 {
898         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
899 }
900 EXPORT_SYMBOL(wait_on_extent_writeback);
901
902 /*
903  * locks a range in ascending order, waiting for any locked regions
904  * it hits on the way.  [start,end] are inclusive, and this will sleep.
905  */
906 int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
907 {
908         int err;
909         u64 failed_start;
910         while (1) {
911                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
912                                      &failed_start, mask);
913                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
914                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
915                         start = failed_start;
916                 } else {
917                         break;
918                 }
919                 WARN_ON(start > end);
920         }
921         return err;
922 }
923 EXPORT_SYMBOL(lock_extent);
924
925 int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
926                   gfp_t mask)
927 {
928         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
929 }
930 EXPORT_SYMBOL(unlock_extent);
931
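/*
 * Typical (illustrative) locking pattern for a caller doing IO on a byte
 * range: take the extent lock, do the work, then release it, e.g.
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... read or write the pages covering [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 */
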
932 /*
933  * helper function to set pages and extents in the tree dirty
934  */
935 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
936 {
937         unsigned long index = start >> PAGE_CACHE_SHIFT;
938         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
939         struct page *page;
940
941         while (index <= end_index) {
942                 page = find_get_page(tree->mapping, index);
943                 BUG_ON(!page);
944                 __set_page_dirty_nobuffers(page);
945                 page_cache_release(page);
946                 index++;
947         }
948         set_extent_dirty(tree, start, end, GFP_NOFS);
949         return 0;
950 }
951 EXPORT_SYMBOL(set_range_dirty);
952
953 /*
954  * helper function to set both pages and extents in the tree writeback
955  */
956 int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
957 {
958         unsigned long index = start >> PAGE_CACHE_SHIFT;
959         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
960         struct page *page;
961
962         while (index <= end_index) {
963                 page = find_get_page(tree->mapping, index);
964                 BUG_ON(!page);
965                 set_page_writeback(page);
966                 page_cache_release(page);
967                 index++;
968         }
969         set_extent_writeback(tree, start, end, GFP_NOFS);
970         return 0;
971 }
972 EXPORT_SYMBOL(set_range_writeback);
973
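/*
 * find the first extent at or after 'start' with any of 'bits' set.
 * Returns 0 and fills *start_ret/*end_ret on success, 1 if nothing is found.
 */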
974 int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
975                           u64 *start_ret, u64 *end_ret, int bits)
976 {
977         struct rb_node *node;
978         struct extent_state *state;
979         int ret = 1;
980
981         read_lock_irq(&tree->lock);
982         /*
983          * this search will find all the extents that end after
984          * our range starts.
985          */
986         node = tree_search(&tree->state, start);
987         if (!node || IS_ERR(node)) {
988                 goto out;
989         }
990
991         while(1) {
992                 state = rb_entry(node, struct extent_state, rb_node);
993                 if (state->end >= start && (state->state & bits)) {
994                         *start_ret = state->start;
995                         *end_ret = state->end;
996                         ret = 0;
997                         break;
998                 }
999                 node = rb_next(node);
1000                 if (!node)
1001                         break;
1002         }
1003 out:
1004         read_unlock_irq(&tree->lock);
1005         return ret;
1006 }
1007 EXPORT_SYMBOL(find_first_extent_bit);
1008
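/*
 * walk forward from 'start' collecting contiguous EXTENT_DELALLOC extents,
 * locking those at or beyond 'lock_start', until either the run ends or
 * max_bytes is reached.  Returns the number of extents found and sets *end
 * to the last byte covered.
 */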
1009 u64 find_lock_delalloc_range(struct extent_map_tree *tree,
1010                              u64 start, u64 lock_start, u64 *end, u64 max_bytes)
1011 {
1012         struct rb_node *node;
1013         struct extent_state *state;
1014         u64 cur_start = start;
1015         u64 found = 0;
1016         u64 total_bytes = 0;
1017
1018         write_lock_irq(&tree->lock);
1019         /*
1020          * this search will find all the extents that end after
1021          * our range starts.
1022          */
1023 search_again:
1024         node = tree_search(&tree->state, cur_start);
1025         if (!node || IS_ERR(node)) {
1026                 goto out;
1027         }
1028
1029         while(1) {
1030                 state = rb_entry(node, struct extent_state, rb_node);
1031                 if (state->start != cur_start) {
1032                         goto out;
1033                 }
1034                 if (!(state->state & EXTENT_DELALLOC)) {
1035                         goto out;
1036                 }
1037                 if (state->start >= lock_start) {
1038                         if (state->state & EXTENT_LOCKED) {
1039                                 DEFINE_WAIT(wait);
1040                                 atomic_inc(&state->refs);
                                     prepare_to_wait(&state->wq, &wait,
                                                     TASK_UNINTERRUPTIBLE);
1041                                 write_unlock_irq(&tree->lock);
1042                                 schedule();
1043                                 write_lock_irq(&tree->lock);
1044                                 finish_wait(&state->wq, &wait);
1045                                 free_extent_state(state);
1046                                 goto search_again;
1047                         }
1048                         state->state |= EXTENT_LOCKED;
1049                 }
1050                 found++;
1051                 *end = state->end;
1052                 cur_start = state->end + 1;
1053                 node = rb_next(node);
1054                 if (!node)
1055                         break;
1056                 total_bytes += state->end - state->start + 1;
1057                 if (total_bytes >= max_bytes)
1058                         break;
1059         }
1060 out:
1061         write_unlock_irq(&tree->lock);
1062         return found;
1063 }
1064
1065 /*
1066  * helper function to lock both pages and extents in the tree.
1067  * pages must be locked first.
1068  */
1069 int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
1070 {
1071         unsigned long index = start >> PAGE_CACHE_SHIFT;
1072         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1073         struct page *page;
1074         int err;
1075
1076         while (index <= end_index) {
1077                 page = grab_cache_page(tree->mapping, index);
1078                 if (!page) {
1079                         err = -ENOMEM;
1080                         goto failed;
1081                 }
1082                 if (IS_ERR(page)) {
1083                         err = PTR_ERR(page);
1084                         goto failed;
1085                 }
1086                 index++;
1087         }
1088         lock_extent(tree, start, end, GFP_NOFS);
1089         return 0;
1090
1091 failed:
1092         /*
1093          * we failed above in getting the page at 'index', so we undo here
1094          * up to but not including the page at 'index'
1095          */
1096         end_index = index;
1097         index = start >> PAGE_CACHE_SHIFT;
1098         while (index < end_index) {
1099                 page = find_get_page(tree->mapping, index);
1100                 unlock_page(page);
1101                 page_cache_release(page);
1102                 index++;
1103         }
1104         return err;
1105 }
1106 EXPORT_SYMBOL(lock_range);
1107
1108 /*
1109  * helper function to unlock both pages and extents in the tree.
1110  */
1111 int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
1112 {
1113         unsigned long index = start >> PAGE_CACHE_SHIFT;
1114         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1115         struct page *page;
1116
1117         while (index <= end_index) {
1118                 page = find_get_page(tree->mapping, index);
1119                 unlock_page(page);
1120                 page_cache_release(page);
1121                 index++;
1122         }
1123         unlock_extent(tree, start, end, GFP_NOFS);
1124         return 0;
1125 }
1126 EXPORT_SYMBOL(unlock_range);
1127
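/* stash a caller supplied value in the extent_state that starts at 'start' */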
1128 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
1129 {
1130         struct rb_node *node;
1131         struct extent_state *state;
1132         int ret = 0;
1133
1134         write_lock_irq(&tree->lock);
1135         /*
1136          * this search will find all the extents that end after
1137          * our range starts.
1138          */
1139         node = tree_search(&tree->state, start);
1140         if (!node || IS_ERR(node)) {
1141                 ret = -ENOENT;
1142                 goto out;
1143         }
1144         state = rb_entry(node, struct extent_state, rb_node);
1145         if (state->start != start) {
1146                 ret = -ENOENT;
1147                 goto out;
1148         }
1149         state->private = private;
1150 out:
1151         write_unlock_irq(&tree->lock);
1152         return ret;
1153 }
1154
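/* fetch the private value stored in the extent_state that starts at 'start' */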
1155 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
1156 {
1157         struct rb_node *node;
1158         struct extent_state *state;
1159         int ret = 0;
1160
1161         read_lock_irq(&tree->lock);
1162         /*
1163          * this search will find all the extents that end after
1164          * our range starts.
1165          */
1166         node = tree_search(&tree->state, start);
1167         if (!node || IS_ERR(node)) {
1168                 ret = -ENOENT;
1169                 goto out;
1170         }
1171         state = rb_entry(node, struct extent_state, rb_node);
1172         if (state->start != start) {
1173                 ret = -ENOENT;
1174                 goto out;
1175         }
1176         *private = state->private;
1177 out:
1178         read_unlock_irq(&tree->lock);
1179         return ret;
1180 }
1181
1182 /*
1183  * searches a range in the state tree for a given mask.
1184  * If 'filled' == 1, this returns 1 only if every extent in the range
1185  * has the bits set.  Otherwise, 1 is returned if any bit in the
1186  * range is found set.
1187  */
1188 int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
1189                    int bits, int filled)
1190 {
1191         struct extent_state *state = NULL;
1192         struct rb_node *node;
1193         int bitset = 0;
1194
1195         read_lock_irq(&tree->lock);
1196         node = tree_search(&tree->state, start);
1197         while (node && start <= end) {
1198                 state = rb_entry(node, struct extent_state, rb_node);
1199                 if (state->start > end)
1200                         break;
1201
1202                 if (filled && state->start > start) {
1203                         bitset = 0;
1204                         break;
1205                 }
1206                 if (state->state & bits) {
1207                         bitset = 1;
1208                         if (!filled)
1209                                 break;
1210                 } else if (filled) {
1211                         bitset = 0;
1212                         break;
1213                 }
1214                 start = state->end + 1;
1215                 if (start > end)
1216                         break;
1217                 node = rb_next(node);
1218         }
1219         read_unlock_irq(&tree->lock);
1220         return bitset;
1221 }
1222 EXPORT_SYMBOL(test_range_bit);
1223
1224 /*
1225  * helper function to set a given page up to date if all the
1226  * extents in the tree for that page are up to date
1227  */
1228 static int check_page_uptodate(struct extent_map_tree *tree,
1229                                struct page *page)
1230 {
1231         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1232         u64 end = start + PAGE_CACHE_SIZE - 1;
1233         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1234                 SetPageUptodate(page);
1235         return 0;
1236 }
1237
1238 /*
1239  * helper function to unlock a page if all the extents in the tree
1240  * for that page are unlocked
1241  */
1242 static int check_page_locked(struct extent_map_tree *tree,
1243                              struct page *page)
1244 {
1245         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1246         u64 end = start + PAGE_CACHE_SIZE - 1;
1247         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1248                 unlock_page(page);
1249         return 0;
1250 }
1251
1252 /*
1253  * helper function to end page writeback if all the extents
1254  * in the tree for that page are done with writeback
1255  */
1256 static int check_page_writeback(struct extent_map_tree *tree,
1257                              struct page *page)
1258 {
1259         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1260         u64 end = start + PAGE_CACHE_SIZE - 1;
1261         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1262                 end_page_writeback(page);
1263         return 0;
1264 }
1265
1266 /* lots and lots of room for performance fixes in the end_bio funcs */
1267
1268 /*
1269  * after a writepage IO is done, we need to:
1270  * clear the uptodate bits on error
1271  * clear the writeback bits in the extent tree for this IO
1272  * end_page_writeback if the page has no more pending IO
1273  *
1274  * Scheduling is not allowed, so the extent state tree is expected
1275  * to have one and only one object corresponding to this IO.
1276  */
1277 static int end_bio_extent_writepage(struct bio *bio,
1278                                    unsigned int bytes_done, int err)
1279 {
1280         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1281         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1282         struct extent_map_tree *tree = bio->bi_private;
1283         u64 start;
1284         u64 end;
1285         int whole_page;
1286
1287         if (bio->bi_size)
1288                 return 1;
1289
1290         do {
1291                 struct page *page = bvec->bv_page;
1292                 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1293                 end = start + bvec->bv_len - 1;
1294
1295                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1296                         whole_page = 1;
1297                 else
1298                         whole_page = 0;
1299
1300                 if (--bvec >= bio->bi_io_vec)
1301                         prefetchw(&bvec->bv_page->flags);
1302
1303                 if (!uptodate) {
1304                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1305                         ClearPageUptodate(page);
1306                         SetPageError(page);
1307                 }
1308                 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1309
1310                 if (whole_page)
1311                         end_page_writeback(page);
1312                 else
1313                         check_page_writeback(tree, page);
1314                 if (tree->ops && tree->ops->writepage_end_io_hook)
1315                         tree->ops->writepage_end_io_hook(page, start, end);
1316         } while (bvec >= bio->bi_io_vec);
1317
1318         bio_put(bio);
1319         return 0;
1320 }
1321
1322 /*
1323  * after a readpage IO is done, we need to:
1324  * clear the uptodate bits on error
1325  * set the uptodate bits if things worked
1326  * set the page up to date if all extents in the tree are uptodate
1327  * clear the lock bit in the extent tree
1328  * unlock the page if there are no other extents locked for it
1329  *
1330  * Scheduling is not allowed, so the extent state tree is expected
1331  * to have one and only one object corresponding to this IO.
1332  */
1333 static int end_bio_extent_readpage(struct bio *bio,
1334                                    unsigned int bytes_done, int err)
1335 {
1336         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1337         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1338         struct extent_map_tree *tree = bio->bi_private;
1339         u64 start;
1340         u64 end;
1341         int whole_page;
1342         int ret;
1343
1344         if (bio->bi_size)
1345                 return 1;
1346
1347         do {
1348                 struct page *page = bvec->bv_page;
1349                 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1350                 end = start + bvec->bv_len - 1;
1351
1352                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1353                         whole_page = 1;
1354                 else
1355                         whole_page = 0;
1356
1357                 if (--bvec >= bio->bi_io_vec)
1358                         prefetchw(&bvec->bv_page->flags);
1359
1360                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1361                         ret = tree->ops->readpage_end_io_hook(page, start, end);
1362                         if (ret)
1363                                 uptodate = 0;
1364                 }
1365                 if (uptodate) {
1366                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1367                         if (whole_page)
1368                                 SetPageUptodate(page);
1369                         else
1370                                 check_page_uptodate(tree, page);
1371                 } else {
1372                         ClearPageUptodate(page);
1373                         SetPageError(page);
1374                 }
1375
1376                 unlock_extent(tree, start, end, GFP_ATOMIC);
1377
1378                 if (whole_page)
1379                         unlock_page(page);
1380                 else
1381                         check_page_locked(tree, page);
1382         } while (bvec >= bio->bi_io_vec);
1383
1384         bio_put(bio);
1385         return 0;
1386 }
1387
1388 /*
1389  * IO done from prepare_write is pretty simple, we just unlock
1390  * the structs in the extent tree when done, and set the uptodate bits
1391  * as appropriate.
1392  */
1393 static int end_bio_extent_preparewrite(struct bio *bio,
1394                                        unsigned int bytes_done, int err)
1395 {
1396         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1397         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1398         struct extent_map_tree *tree = bio->bi_private;
1399         u64 start;
1400         u64 end;
1401
1402         if (bio->bi_size)
1403                 return 1;
1404
1405         do {
1406                 struct page *page = bvec->bv_page;
1407                 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1408                 end = start + bvec->bv_len - 1;
1409
1410                 if (--bvec >= bio->bi_io_vec)
1411                         prefetchw(&bvec->bv_page->flags);
1412
1413                 if (uptodate) {
1414                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1415                 } else {
1416                         ClearPageUptodate(page);
1417                         SetPageError(page);
1418                 }
1419
1420                 unlock_extent(tree, start, end, GFP_ATOMIC);
1421
1422         } while (bvec >= bio->bi_io_vec);
1423
1424         bio_put(bio);
1425         return 0;
1426 }
1427
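/*
 * build and submit a single page (single bio_vec) bio for the given sector,
 * with the tree as the bio private data for the end_io handlers above
 */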
1428 static int submit_extent_page(int rw, struct extent_map_tree *tree,
1429                               struct page *page, sector_t sector,
1430                               size_t size, unsigned long offset,
1431                               struct block_device *bdev,
1432                               bio_end_io_t end_io_func)
1433 {
1434         struct bio *bio;
1435         int ret = 0;
1436
1437         bio = bio_alloc(GFP_NOIO, 1);
1438
1439         bio->bi_sector = sector;
1440         bio->bi_bdev = bdev;
1441         bio->bi_io_vec[0].bv_page = page;
1442         bio->bi_io_vec[0].bv_len = size;
1443         bio->bi_io_vec[0].bv_offset = offset;
1444
1445         bio->bi_vcnt = 1;
1446         bio->bi_idx = 0;
1447         bio->bi_size = size;
1448
1449         bio->bi_end_io = end_io_func;
1450         bio->bi_private = tree;
1451
1452         bio_get(bio);
1453         submit_bio(rw, bio);
1454
1455         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1456                 ret = -EOPNOTSUPP;
1457
1458         bio_put(bio);
1459         return ret;
1460 }
1461
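/*
 * tag a page as belonging to the extent map code by setting page->private,
 * taking an extra page reference the first time it is seen
 */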
1462 void set_page_extent_mapped(struct page *page)
1463 {
1464         if (!PagePrivate(page)) {
1465                 SetPagePrivate(page);
1466                 WARN_ON(!page->mapping->a_ops->invalidatepage);
1467                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1468                 page_cache_get(page);
1469         }
1470 }
1471
1472 /*
1473  * basic readpage implementation.  Locked extent state structs are inserted
1474  * into the tree that are removed when the IO is done (by the end_io
1475  * handlers)
1476  */
1477 int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
1478                           get_extent_t *get_extent)
1479 {
1480         struct inode *inode = page->mapping->host;
1481         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1482         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1483         u64 end;
1484         u64 cur = start;
1485         u64 extent_offset;
1486         u64 last_byte = i_size_read(inode);
1487         u64 block_start;
1488         u64 cur_end;
1489         sector_t sector;
1490         struct extent_map *em;
1491         struct block_device *bdev;
1492         int ret;
1493         int nr = 0;
1494         size_t page_offset = 0;
1495         size_t iosize;
1496         size_t blocksize = inode->i_sb->s_blocksize;
1497
1498         set_page_extent_mapped(page);
1499
1500         end = page_end;
1501         lock_extent(tree, start, end, GFP_NOFS);
1502
1503         while (cur <= end) {
1504                 if (cur >= last_byte) {
1505                         iosize = PAGE_CACHE_SIZE - page_offset;
1506                         zero_user_page(page, page_offset, iosize, KM_USER0);
1507                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1508                                             GFP_NOFS);
1509                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1510                         break;
1511                 }
1512                 em = get_extent(inode, page, page_offset, cur, end, 0);
1513                 if (IS_ERR(em) || !em) {
1514                         SetPageError(page);
1515                         unlock_extent(tree, cur, end, GFP_NOFS);
1516                         break;
1517                 }
1518
1519                 extent_offset = cur - em->start;
1520                 BUG_ON(em->end < cur);
1521                 BUG_ON(end < cur);
1522
1523                 iosize = min(em->end - cur, end - cur) + 1;
1524                 cur_end = min(em->end, end);
1525                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1526                 sector = (em->block_start + extent_offset) >> 9;
1527                 bdev = em->bdev;
1528                 block_start = em->block_start;
1529                 free_extent_map(em);
1530                 em = NULL;
1531
1532                 /* we've found a hole, just zero and go on */
1533                 if (block_start == EXTENT_MAP_HOLE) {
1534                         zero_user_page(page, page_offset, iosize, KM_USER0);
1535                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1536                                             GFP_NOFS);
1537                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1538                         cur = cur + iosize;
1539                         page_offset += iosize;
1540                         continue;
1541                 }
1542                 /* the get_extent function already copied into the page */
1543                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1544                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1545                         cur = cur + iosize;
1546                         page_offset += iosize;
1547                         continue;
1548                 }
1549
1550                 ret = 0;
1551                 if (tree->ops && tree->ops->readpage_io_hook) {
1552                         ret = tree->ops->readpage_io_hook(page, cur,
1553                                                           cur + iosize - 1);
1554                 }
1555                 if (!ret) {
1556                         ret = submit_extent_page(READ, tree, page,
1557                                                  sector, iosize, page_offset,
1558                                                  bdev, end_bio_extent_readpage);
1559                 }
1560                 if (ret)
1561                         SetPageError(page);
1562                 cur = cur + iosize;
1563                 page_offset += iosize;
1564                 nr++;
1565         }
1566         if (!nr) {
1567                 if (!PageError(page))
1568                         SetPageUptodate(page);
1569                 unlock_page(page);
1570         }
1571         return 0;
1572 }
1573 EXPORT_SYMBOL(extent_read_full_page);
1574
1575 /*
1576  * the writepage semantics are similar to regular writepage.  extent
1577  * records are inserted to lock ranges in the tree, and as dirty areas
1578  * are found, they are marked writeback.  Then the lock bits are removed
1579  * and the end_io handler clears the writeback ranges
1580  */
1581 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
1582                           get_extent_t *get_extent,
1583                           struct writeback_control *wbc)
1584 {
1585         struct inode *inode = page->mapping->host;
1586         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1587         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1588         u64 end;
1589         u64 cur = start;
1590         u64 extent_offset;
1591         u64 last_byte = i_size_read(inode);
1592         u64 block_start;
1593         sector_t sector;
1594         struct extent_map *em;
1595         struct block_device *bdev;
1596         int ret;
1597         int nr = 0;
1598         size_t page_offset = 0;
1599         size_t iosize;
1600         size_t blocksize;
1601         loff_t i_size = i_size_read(inode);
1602         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1603         u64 nr_delalloc;
1604         u64 delalloc_end;
1605
1606         WARN_ON(!PageLocked(page));
1607         if (page->index > end_index) {
1608                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1609                 unlock_page(page);
1610                 return 0;
1611         }
1612
1613         if (page->index == end_index) {
1614                 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1615                 zero_user_page(page, offset,
1616                                PAGE_CACHE_SIZE - offset, KM_USER0);
1617         }
1618
1619         set_page_extent_mapped(page);
1620
1621         lock_extent(tree, start, page_end, GFP_NOFS);
1622         nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
1623                                                &delalloc_end,
1624                                                128 * 1024 * 1024);
1625         if (nr_delalloc) {
1626                 tree->ops->fill_delalloc(inode, start, delalloc_end);
1627                 if (delalloc_end >= page_end + 1) {
1628                         clear_extent_bit(tree, page_end + 1, delalloc_end,
1629                                          EXTENT_LOCKED | EXTENT_DELALLOC,
1630                                          1, 0, GFP_NOFS);
1631                 }
1632                 clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
1633                                  0, 0, GFP_NOFS);
1634                 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1635                         printk("found delalloc bits after clear extent_bit\n");
1636                 }
1637         } else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1638                 printk("found delalloc bits after find_delalloc_range returns 0\n");
1639         }
1640
1641         end = page_end;
1642         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1643                 printk("found delalloc bits after lock_extent\n");
1644         }
1645
1646         if (last_byte <= start) {
1647                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1648                 goto done;
1649         }
1650
1651         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1652         blocksize = inode->i_sb->s_blocksize;
1653
1654         while (cur <= end) {
1655                 if (cur >= last_byte) {
1656                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1657                         break;
1658                 }
1659                 em = get_extent(inode, page, page_offset, cur, end, 0);
1660                 if (IS_ERR(em) || !em) {
1661                         SetPageError(page);
1662                         break;
1663                 }
1664
1665                 extent_offset = cur - em->start;
1666                 BUG_ON(em->end < cur);
1667                 BUG_ON(end < cur);
1668                 iosize = min(em->end - cur, end - cur) + 1;
1669                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1670                 sector = (em->block_start + extent_offset) >> 9;
1671                 bdev = em->bdev;
1672                 block_start = em->block_start;
1673                 free_extent_map(em);
1674                 em = NULL;
1675
1676                 if (block_start == EXTENT_MAP_HOLE ||
1677                     block_start == EXTENT_MAP_INLINE) {
1678                         clear_extent_dirty(tree, cur,
1679                                            cur + iosize - 1, GFP_NOFS);
1680                         cur = cur + iosize;
1681                         page_offset += iosize;
1682                         continue;
1683                 }
1684
1685                 /* leave this out until we have a page_mkwrite call */
1686                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1687                                    EXTENT_DIRTY, 0)) {
1688                         cur = cur + iosize;
1689                         page_offset += iosize;
1690                         continue;
1691                 }
1692                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1693                 if (tree->ops && tree->ops->writepage_io_hook) {
1694                         ret = tree->ops->writepage_io_hook(page, cur,
1695                                                 cur + iosize - 1);
1696                 } else {
1697                         ret = 0;
1698                 }
1699                 if (ret)
1700                         SetPageError(page);
1701                 else {
1702                         set_range_writeback(tree, cur, cur + iosize - 1);
1703                         ret = submit_extent_page(WRITE, tree, page, sector,
1704                                                  iosize, page_offset, bdev,
1705                                                  end_bio_extent_writepage);
1706                         if (ret)
1707                                 SetPageError(page);
1708                 }
1709                 cur = cur + iosize;
1710                 page_offset += iosize;
1711                 nr++;
1712         }
1713 done:
1714         unlock_extent(tree, start, page_end, GFP_NOFS);
1715         unlock_page(page);
1716         return 0;
1717 }
1718 EXPORT_SYMBOL(extent_write_full_page);
1719
1720 /*
1721  * basic invalidatepage code.  This waits on any locked or writeback
1722  * ranges corresponding to the page, and then deletes any extent state
1723  * records from the tree
1724  */
1725 int extent_invalidatepage(struct extent_map_tree *tree,
1726                           struct page *page, unsigned long offset)
1727 {
1728         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
1729         u64 end = start + PAGE_CACHE_SIZE - 1;
1730         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
1731
1732         start += (offset + blocksize - 1) & ~(blocksize - 1);
1733         if (start > end)
1734                 return 0;
1735
1736         lock_extent(tree, start, end, GFP_NOFS);
1737         wait_on_extent_writeback(tree, start, end);
1738         clear_extent_bit(tree, start, end,
1739                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
1740                          1, 1, GFP_NOFS);
1741         return 0;
1742 }
1743 EXPORT_SYMBOL(extent_invalidatepage);
1744
1745 /*
1746  * simple commit_write call; the page is marked dirty with set_page_dirty
1747  * and i_size is updated if the write extends past the old size
1748  */
1749 int extent_commit_write(struct extent_map_tree *tree,
1750                         struct inode *inode, struct page *page,
1751                         unsigned from, unsigned to)
1752 {
1753         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1754
1755         set_page_extent_mapped(page);
1756         set_page_dirty(page);
1757
1758         if (pos > inode->i_size) {
1759                 i_size_write(inode, pos);
1760                 mark_inode_dirty(inode);
1761         }
1762         return 0;
1763 }
1764 EXPORT_SYMBOL(extent_commit_write);
1765
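/*
 * prepare_write helper.  For each block touched by a write from 'from'
 * to 'to', newly allocated blocks have the bytes outside the write
 * zeroed, and blocks that aren't uptodate and are only partially covered
 * are read in.  Any reads started here are waited on before returning.
 */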
1766 int extent_prepare_write(struct extent_map_tree *tree,
1767                          struct inode *inode, struct page *page,
1768                          unsigned from, unsigned to, get_extent_t *get_extent)
1769 {
1770         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
1771         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
1772         u64 block_start;
1773         u64 orig_block_start;
1774         u64 block_end;
1775         u64 cur_end;
1776         struct extent_map *em;
1777         unsigned blocksize = 1 << inode->i_blkbits;
1778         size_t page_offset = 0;
1779         size_t block_off_start;
1780         size_t block_off_end;
1781         int err = 0;
1782         int iocount = 0;
1783         int ret = 0;
1784         int isnew;
1785
1786         set_page_extent_mapped(page);
1787
1788         block_start = (page_start + from) & ~((u64)blocksize - 1);
1789         block_end = (page_start + to - 1) | (blocksize - 1);
1790         orig_block_start = block_start;
1791
1792         lock_extent(tree, page_start, page_end, GFP_NOFS);
1793         while(block_start <= block_end) {
1794                 em = get_extent(inode, page, page_offset, block_start,
1795                                 block_end, 1);
1796                 if (IS_ERR(em) || !em) {
1797                         goto err;
1798                 }
1799                 cur_end = min(block_end, em->end);
1800                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
1801                 block_off_end = block_off_start + blocksize;
1802                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
1803
1804                 if (!PageUptodate(page) && isnew &&
1805                     (block_off_end > to || block_off_start < from)) {
1806                         void *kaddr;
1807
1808                         kaddr = kmap_atomic(page, KM_USER0);
1809                         if (block_off_end > to)
1810                                 memset(kaddr + to, 0, block_off_end - to);
1811                         if (block_off_start < from)
1812                                 memset(kaddr + block_off_start, 0,
1813                                        from - block_off_start);
1814                         flush_dcache_page(page);
1815                         kunmap_atomic(kaddr, KM_USER0);
1816                 }
1817                 if (!isnew && !PageUptodate(page) &&
1818                     (block_off_end > to || block_off_start < from) &&
1819                     !test_range_bit(tree, block_start, cur_end,
1820                                     EXTENT_UPTODATE, 1)) {
1821                         u64 sector;
1822                         u64 extent_offset = block_start - em->start;
1823                         size_t iosize;
1824                         sector = (em->block_start + extent_offset) >> 9;
1825                         iosize = (cur_end - block_start + blocksize - 1) &
1826                                 ~((u64)blocksize - 1);
1827                         /*
1828                          * we've already got the extent locked, but we
1829                          * need to split the state such that our end_bio
1830                          * handler can clear the lock.
1831                          */
1832                         set_extent_bit(tree, block_start,
1833                                        block_start + iosize - 1,
1834                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
1835                         ret = submit_extent_page(READ, tree, page,
1836                                          sector, iosize, page_offset, em->bdev,
1837                                          end_bio_extent_preparewrite);
1838                         iocount++;
1839                         block_start = block_start + iosize;
1840                 } else {
1841                         set_extent_uptodate(tree, block_start, cur_end,
1842                                             GFP_NOFS);
1843                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
1844                         block_start = cur_end + 1;
1845                 }
1846                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
1847                 free_extent_map(em);
1848         }
1849         if (iocount) {
1850                 wait_extent_bit(tree, orig_block_start,
1851                                 block_end, EXTENT_LOCKED);
1852         }
1853         check_page_uptodate(tree, page);
1854 err:
1855         /* FIXME, zero out newly allocated blocks on error */
1856         return err;
1857 }
1858 EXPORT_SYMBOL(extent_prepare_write);
1859
1860 /*
1861  * a helper for releasepage.  As long as there are no locked extents
1862  * in the range corresponding to the page, both state records and extent
1863  * map records are removed
1864  */
1865 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
1866 {
1867         struct extent_map *em;
1868         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1869         u64 end = start + PAGE_CACHE_SIZE - 1;
1870         u64 orig_start = start;
1871         int ret = 1;
1872
1873         while (start <= end) {
1874                 em = lookup_extent_mapping(tree, start, end);
1875                 if (!em || IS_ERR(em))
1876                         break;
1877                 if (!test_range_bit(tree, em->start, em->end,
1878                                     EXTENT_LOCKED, 0)) {
1879                         remove_extent_mapping(tree, em);
1880                         /* once for the rb tree */
1881                         free_extent_map(em);
1882                 }
1883                 start = em->end + 1;
1884                 /* once for us */
1885                 free_extent_map(em);
1886         }
1887         if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
1888                 ret = 0;
1889         else
1890                 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
1891                                  1, 1, GFP_NOFS);
1892         return ret;
1893 }
1894 EXPORT_SYMBOL(try_release_extent_mapping);
1895
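/*
 * bmap helper: map a logical file block to a disk block via get_extent.
 * Holes and inline extents map to 0.
 */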
1896 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
1897                 get_extent_t *get_extent)
1898 {
1899         struct inode *inode = mapping->host;
1900         u64 start = (u64)iblock << inode->i_blkbits;
1901         u64 end = start + (1 << inode->i_blkbits) - 1;
1902         struct extent_map *em;
1903
1904         em = get_extent(inode, NULL, 0, start, end, 0);
1905         if (!em || IS_ERR(em))
1906                 return 0;
1907
1908         if (em->block_start == EXTENT_MAP_INLINE ||
1909             em->block_start == EXTENT_MAP_HOLE)
1910                 return 0;
1911
1912         return (em->block_start + start - em->start) >> inode->i_blkbits;
1913 }
1914
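/*
 * put an extent_buffer at the head of the per-tree lru, dropping the
 * oldest entry once BUFFER_LRU_MAX is reached.  Buffers already on the
 * lru are simply moved to the front.  The caller must hold
 * tree->lru_lock.
 */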
1915 static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
1916 {
1917         if (list_empty(&eb->lru)) {
1918                 extent_buffer_get(eb);
1919                 list_add(&eb->lru, &tree->buffer_lru);
1920                 tree->lru_size++;
1921                 if (tree->lru_size >= BUFFER_LRU_MAX) {
1922                         struct extent_buffer *rm;
1923                         rm = list_entry(tree->buffer_lru.prev,
1924                                         struct extent_buffer, lru);
1925                         tree->lru_size--;
1926                         list_del(&rm->lru);
1927                         free_extent_buffer(rm);
1928                 }
1929         } else
1930                 list_move(&eb->lru, &tree->buffer_lru);
1931         return 0;
1932 }
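
/*
 * search the per-tree lru for a buffer matching start/len and take a
 * reference on it if found.  The caller must hold tree->lru_lock.
 */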
1933 static struct extent_buffer *find_lru(struct extent_map_tree *tree,
1934                                       u64 start, unsigned long len)
1935 {
1936         struct list_head *lru = &tree->buffer_lru;
1937         struct list_head *cur = lru->next;
1938         struct extent_buffer *eb;
1939
1940         if (list_empty(lru))
1941                 return NULL;
1942
1943         do {
1944                 eb = list_entry(cur, struct extent_buffer, lru);
1945                 if (eb->start == start && eb->len == len) {
1946                         extent_buffer_get(eb);
1947                         return eb;
1948                 }
1949                 cur = cur->next;
1950         } while (cur != lru);
1951         return NULL;
1952 }
1953
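/* how many pages does the byte range [start, start + len) touch? */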
1954 static inline unsigned long num_extent_pages(u64 start, u64 len)
1955 {
1956         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
1957                 (start >> PAGE_CACHE_SHIFT);
1958 }
1959
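/*
 * return the page cache page backing page 'i' of the buffer.  The first
 * page is cached in the buffer itself; the others come from the mapping,
 * and the extra reference from find_get_page is dropped right away
 * because the buffer already holds one.
 */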
1960 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
1961                                               unsigned long i)
1962 {
1963         struct page *p;
1964
1965         if (i == 0)
1966                 return eb->first_page;
1967         i += eb->start >> PAGE_CACHE_SHIFT;
1968         p = find_get_page(eb->first_page->mapping, i);
1969         page_cache_release(p);
1970         return p;
1971 }
1972
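/*
 * find a buffer for [start, start + len) on the lru or allocate a fresh
 * one.  Either way the buffer ends up at the head of the lru with a
 * reference held for the caller.
 */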
1973 static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
1974                                                    u64 start,
1975                                                    unsigned long len,
1976                                                    gfp_t mask)
1977 {
1978         struct extent_buffer *eb = NULL;
1979
1980         spin_lock(&tree->lru_lock);
1981         eb = find_lru(tree, start, len);
1982         if (eb) {
1983                 goto lru_add;
1984         }
1985         spin_unlock(&tree->lru_lock);
1986
1987         /* eb is always NULL here: an lru hit jumps straight to lru_add */
1988         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
1989         if (!eb)
1990                 return NULL;
1992         INIT_LIST_HEAD(&eb->lru);
1993         eb->start = start;
1994         eb->len = len;
1995         atomic_set(&eb->refs, 1);
1996
1997         spin_lock(&tree->lru_lock);
1998 lru_add:
1999         add_lru(tree, eb);
2000         spin_unlock(&tree->lru_lock);
2001         return eb;
2002 }
2003
2004 static void __free_extent_buffer(struct extent_buffer *eb)
2005 {
2006         kmem_cache_free(extent_buffer_cache, eb);
2007 }
2008
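/*
 * allocate (or find on the per-tree lru) an extent_buffer covering
 * [start, start + len) and pin the backing pages from tree->mapping.
 * page0, when non-NULL, is used as the first page instead of being
 * looked up.  Returns NULL on failure.
 *
 * A minimal usage sketch (names and error handling here are
 * illustrative only, not taken from this file):
 *
 *	eb = alloc_extent_buffer(tree, start, len, NULL, GFP_NOFS);
 *	if (!eb)
 *		return -ENOMEM;
 *	ret = read_extent_buffer_pages(tree, eb, 0, 1);
 *	if (!ret)
 *		read_extent_buffer(eb, buf, 0, len);
 *	free_extent_buffer(eb);
 */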
2009 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
2010                                           u64 start, unsigned long len,
2011                                           struct page *page0,
2012                                           gfp_t mask)
2013 {
2014         unsigned long num_pages = num_extent_pages(start, len);
2015         unsigned long i;
2016         unsigned long index = start >> PAGE_CACHE_SHIFT;
2017         struct extent_buffer *eb;
2018         struct page *p;
2019         struct address_space *mapping = tree->mapping;
2020         int uptodate = 0;
2021
2022         eb = __alloc_extent_buffer(tree, start, len, mask);
2023         if (!eb || IS_ERR(eb))
2024                 return NULL;
2025
2026         if (eb->flags & EXTENT_BUFFER_FILLED)
2027                 return eb;
2028
2029         if (page0) {
2030                 eb->first_page = page0;
2031                 i = 1;
2032                 index++;
2033                 page_cache_get(page0);
2034                 mark_page_accessed(page0);
2035                 set_page_extent_mapped(page0);
2036                 set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
2037                                  len << 2);
2038         } else {
2039                 i = 0;
2040         }
2041         for (; i < num_pages; i++, index++) {
2042                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2043                 if (!p) {
2044                         WARN_ON(1);
2045                         /* make sure the free only frees the pages we've
2046                          * grabbed a reference on
2047                          */
2048                         eb->len = i << PAGE_CACHE_SHIFT;
2049                         eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
2050                         goto fail;
2051                 }
2052                 set_page_extent_mapped(p);
2053                 mark_page_accessed(p);
2054                 if (i == 0) {
2055                         eb->first_page = p;
2056                         set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
2057                                          len << 2);
2058                 } else {
2059                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2060                 }
2061                 if (!PageUptodate(p))
2062                         uptodate = 0;
2063                 unlock_page(p);
2064         }
2065         if (uptodate)
2066                 eb->flags |= EXTENT_UPTODATE;
2067         eb->flags |= EXTENT_BUFFER_FILLED;
2068         return eb;
2069 fail:
2070         free_extent_buffer(eb);
2071         return NULL;
2072 }
2073 EXPORT_SYMBOL(alloc_extent_buffer);
2074
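/*
 * like alloc_extent_buffer, but only uses pages already present in the
 * page cache (find_lock_page); returns NULL if any page in the range is
 * missing.
 */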
2075 struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
2076                                          u64 start, unsigned long len,
2077                                           gfp_t mask)
2078 {
2079         unsigned long num_pages = num_extent_pages(start, len);
2080         unsigned long i;
             unsigned long index = start >> PAGE_CACHE_SHIFT;
2081         struct extent_buffer *eb;
2082         struct page *p;
2083         struct address_space *mapping = tree->mapping;
2084         int uptodate = 1;
2085
2086         eb = __alloc_extent_buffer(tree, start, len, mask);
2087         if (!eb || IS_ERR(eb))
2088                 return NULL;
2089
2090         if (eb->flags & EXTENT_BUFFER_FILLED)
2091                 return eb;
2092
2093         for (i = 0; i < num_pages; i++, index++) {
2094                 p = find_lock_page(mapping, index);
2095                 if (!p) {
2096                         /* make sure the free only frees the pages we've
2097                          * grabbed a reference on
2098                          */
2099                         eb->len = i << PAGE_CACHE_SHIFT;
2100                         eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
2101                         goto fail;
2102                 }
2103                 set_page_extent_mapped(p);
2104                 mark_page_accessed(p);
2105
2106                 if (i == 0) {
2107                         eb->first_page = p;
2108                         set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
2109                                          len << 2);
2110                 } else {
2111                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2112                 }
2113
2114                 if (!PageUptodate(p))
2115                         uptodate = 0;
2116                 unlock_page(p);
2117         }
2118         if (uptodate)
2119                 eb->flags |= EXTENT_UPTODATE;
2120         eb->flags |= EXTENT_BUFFER_FILLED;
2121         return eb;
2122 fail:
2123         free_extent_buffer(eb);
2124         return NULL;
2125 }
2126 EXPORT_SYMBOL(find_extent_buffer);
2127
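/*
 * drop a reference on an extent_buffer.  The last reference releases the
 * page references and frees the buffer itself.
 */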
2128 void free_extent_buffer(struct extent_buffer *eb)
2129 {
2130         unsigned long i;
2131         unsigned long num_pages;
2132
2133         if (!eb)
2134                 return;
2135
2136         if (!atomic_dec_and_test(&eb->refs))
2137                 return;
2138
2139         num_pages = num_extent_pages(eb->start, eb->len);
2140
2141         for (i = 0; i < num_pages; i++) {
2142                 page_cache_release(extent_buffer_page(eb, i));
2143         }
2144         __free_extent_buffer(eb);
2145 }
2146 EXPORT_SYMBOL(free_extent_buffer);
2147
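/*
 * clear the dirty bits on the buffer's range in the extent tree and
 * clear the page dirty flags, taking care not to clean pages that are
 * only partially covered by this buffer and still dirty elsewhere.
 */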
2148 int clear_extent_buffer_dirty(struct extent_map_tree *tree,
2149                               struct extent_buffer *eb)
2150 {
2151         int set;
2152         unsigned long i;
2153         unsigned long num_pages;
2154         struct page *page;
2155
2156         u64 start = eb->start;
2157         u64 end = start + eb->len - 1;
2158
2159         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2160         num_pages = num_extent_pages(eb->start, eb->len);
2161
2162         for (i = 0; i < num_pages; i++) {
2163                 page = extent_buffer_page(eb, i);
2164                 lock_page(page);
2165                 /*
2166                  * if we're on the last page or the first page and the
2167                  * block isn't aligned on a page boundary, do extra checks
2168                  * to make sure we don't clean a page that is partially dirty
2169                  */
2170                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2171                     ((i == num_pages - 1) &&
2172                      ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
2173                         start = (u64)page->index << PAGE_CACHE_SHIFT;
2174                         end  = start + PAGE_CACHE_SIZE - 1;
2175                         if (test_range_bit(tree, start, end,
2176                                            EXTENT_DIRTY, 0)) {
2177                                 unlock_page(page);
2178                                 continue;
2179                         }
2180                 }
2181                 clear_page_dirty_for_io(page);
2182                 unlock_page(page);
2183         }
2184         return 0;
2185 }
2186 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2187
2188 int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
2189                                     struct extent_buffer *eb)
2190 {
2191         return wait_on_extent_writeback(tree, eb->start,
2192                                         eb->start + eb->len - 1);
2193 }
2194 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2195
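/*
 * mark the buffer's pages dirty and set the dirty bits on its range in
 * the extent tree.  page->private of the first page is reset here
 * because releasepage may have dropped it.
 */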
2196 int set_extent_buffer_dirty(struct extent_map_tree *tree,
2197                              struct extent_buffer *eb)
2198 {
2199         unsigned long i;
2200         unsigned long num_pages;
2201
2202         num_pages = num_extent_pages(eb->start, eb->len);
2203         for (i = 0; i < num_pages; i++) {
2204                 struct page *page = extent_buffer_page(eb, i);
2205                 /* writepage may need to do something special for the
2206                  * first page, so we have to make sure page->private is
2207                  * properly set.  releasepage may drop page->private
2208                  * on us if the page isn't already dirty.
2209                  */
2210                 if (i == 0) {
2211                         lock_page(page);
2212                         set_page_private(page,
2213                                          EXTENT_PAGE_PRIVATE_FIRST_PAGE |
2214                                          eb->len << 2);
2215                 }
2216                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2217                 if (i == 0)
2218                         unlock_page(page);
2219         }
2220         return set_extent_dirty(tree, eb->start,
2221                                 eb->start + eb->len - 1, GFP_NOFS);
2222 }
2223 EXPORT_SYMBOL(set_extent_buffer_dirty);
2224
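/*
 * mark the buffer's range uptodate in the extent tree and set
 * PageUptodate on every page it fully covers.  Partially covered first
 * and last pages go through check_page_uptodate instead.
 */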
2225 int set_extent_buffer_uptodate(struct extent_map_tree *tree,
2226                                 struct extent_buffer *eb)
2227 {
2228         unsigned long i;
2229         struct page *page;
2230         unsigned long num_pages;
2231
2232         num_pages = num_extent_pages(eb->start, eb->len);
2233
2234         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2235                             GFP_NOFS);
2236         for (i = 0; i < num_pages; i++) {
2237                 page = extent_buffer_page(eb, i);
2238                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2239                     ((i == num_pages - 1) &&
2240                      ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
2241                         check_page_uptodate(tree, page);
2242                         continue;
2243                 }
2244                 SetPageUptodate(page);
2245         }
2246         return 0;
2247 }
2248 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2249
2250 int extent_buffer_uptodate(struct extent_map_tree *tree,
2251                              struct extent_buffer *eb)
2252 {
2253         if (eb->flags & EXTENT_UPTODATE)
2254                 return 1;
2255         return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2256                            EXTENT_UPTODATE, 1);
2257 }
2258 EXPORT_SYMBOL(extent_buffer_uptodate);
2259
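/*
 * read in any pages of the buffer that aren't uptodate, beginning with
 * the page that contains file offset 'start' (pass 0 to start from the
 * buffer's first page).  With 'wait' set, wait for the IO and return
 * -EIO if any page failed; otherwise return once the reads are
 * submitted.
 */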
2260 int read_extent_buffer_pages(struct extent_map_tree *tree,
2261                              struct extent_buffer *eb,
2262                              u64 start,
2263                              int wait)
2264 {
2265         unsigned long i;
2266         unsigned long start_i;
2267         struct page *page;
2268         int err;
2269         int ret = 0;
2270         unsigned long num_pages;
2271
2272         if (eb->flags & EXTENT_UPTODATE)
2273                 return 0;
2274
2275         if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2276                            EXTENT_UPTODATE, 1)) {
2277                 return 0;
2278         }
2279         if (start) {
2280                 WARN_ON(start < eb->start);
2281                 start_i = (start >> PAGE_CACHE_SHIFT) -
2282                         (eb->start >> PAGE_CACHE_SHIFT);
2283         } else {
2284                 start_i = 0;
2285         }
2286
2287         num_pages = num_extent_pages(eb->start, eb->len);
2288         for (i = start_i; i < num_pages; i++) {
2289                 page = extent_buffer_page(eb, i);
2290                 if (PageUptodate(page)) {
2291                         continue;
2292                 }
2293                 if (!wait) {
2294                         if (TestSetPageLocked(page)) {
2295                                 continue;
2296                         }
2297                 } else {
2298                         lock_page(page);
2299                 }
2300                 if (!PageUptodate(page)) {
2301                         err = page->mapping->a_ops->readpage(NULL, page);
2302                         if (err) {
2303                                 ret = err;
2304                         }
2305                 } else {
2306                         unlock_page(page);
2307                 }
2308         }
2309
2310         if (ret || !wait) {
2311                 return ret;
2312         }
2313
2314         for (i = start_i; i < num_pages; i++) {
2315                 page = extent_buffer_page(eb, i);
2316                 wait_on_page_locked(page);
2317                 if (!PageUptodate(page)) {
2318                         ret = -EIO;
2319                 }
2320         }
2321         if (!ret)
2322                 eb->flags |= EXTENT_UPTODATE;
2323         return ret;
2324 }
2325 EXPORT_SYMBOL(read_extent_buffer_pages);
2326
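/*
 * copy 'len' bytes starting at offset 'start' inside the buffer into
 * dstv, crossing page boundaries as needed.
 */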
2327 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2328                         unsigned long start,
2329                         unsigned long len)
2330 {
2331         size_t cur;
2332         size_t offset;
2333         struct page *page;
2334         char *kaddr;
2335         char *dst = (char *)dstv;
2336         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2337         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2338         unsigned long num_pages = num_extent_pages(eb->start, eb->len);
2339
2340         WARN_ON(start > eb->len);
2341         WARN_ON(start + len > eb->start + eb->len);
2342
2343         offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
2344         if (i == 0)
2345                 offset += start_offset;
2346
2347         while(len > 0) {
2348                 page = extent_buffer_page(eb, i);
2349                 if (!PageUptodate(page)) {
2350                         printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
2351                         WARN_ON(1);
2352                 }
2353                 WARN_ON(!PageUptodate(page));
2354
2355                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2356                 kaddr = kmap_atomic(page, KM_USER0);
2357                 memcpy(dst, kaddr + offset, cur);
2358                 kunmap_atomic(kaddr, KM_USER0);
2359
2360                 dst += cur;
2361                 len -= cur;
2362                 offset = 0;
2363                 i++;
2364         }
2365 }
2366 EXPORT_SYMBOL(read_extent_buffer);
2367
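/*
 * kmap_atomic the single page holding [start, start + min_len) inside
 * the buffer.  *token gets the raw mapping (for unmap), *map points at
 * 'start', and *map_start/*map_len describe the part of the buffer the
 * mapping covers.  Returns -EINVAL if the range spans two pages.
 */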
2368 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
2369                                unsigned long min_len, char **token, char **map,
2370                                unsigned long *map_start,
2371                                unsigned long *map_len, int km)
2372 {
2373         size_t offset = start & (PAGE_CACHE_SIZE - 1);
2374         char *kaddr;
2375         struct page *p;
2376         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2377         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2378         unsigned long end_i = (start_offset + start + min_len) >>
2379                 PAGE_CACHE_SHIFT;
2380
2381         if (i != end_i)
2382                 return -EINVAL;
2383
2384         if (i == 0) {
2385                 offset = start_offset;
2386                 *map_start = 0;
2387         } else {
2388                 offset = 0;
2389                 *map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
2390         }
2391         if (start + min_len >= eb->len) {
2392                 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
2393                 WARN_ON(1);
2394         }
2395
2396         p = extent_buffer_page(eb, i);
2397         WARN_ON(!PageUptodate(p));
2398         kaddr = kmap_atomic(p, km);
2399         *token = kaddr;
2400         *map = kaddr + offset;
2401         *map_len = PAGE_CACHE_SIZE - offset;
2402         return 0;
2403 }
2404 EXPORT_SYMBOL(map_private_extent_buffer);
2405
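/*
 * like map_private_extent_buffer, but if the buffer already had a cached
 * mapping it is unmapped first and the new mapping is cached in its
 * place for later reuse.
 */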
2406 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
2407                       unsigned long min_len,
2408                       char **token, char **map,
2409                       unsigned long *map_start,
2410                       unsigned long *map_len, int km)
2411 {
2412         int err;
2413         int save = 0;
2414         if (eb->map_token) {
2415                 unmap_extent_buffer(eb, eb->map_token, km);
2416                 eb->map_token = NULL;
2417                 save = 1;
2418         }
2419         err = map_private_extent_buffer(eb, start, min_len, token, map,
2420                                        map_start, map_len, km);
2421         if (!err && save) {
2422                 eb->map_token = *token;
2423                 eb->kaddr = *map;
2424                 eb->map_start = *map_start;
2425                 eb->map_len = *map_len;
2426         }
2427         return err;
2428 }
2429 EXPORT_SYMBOL(map_extent_buffer);
2430
2431 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
2432 {
2433         kunmap_atomic(token, km);
2434 }
2435 EXPORT_SYMBOL(unmap_extent_buffer);
2436
2437 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
2438                           unsigned long start,
2439                           unsigned long len)
2440 {
2441         size_t cur;
2442         size_t offset;
2443         struct page *page;
2444         char *kaddr;
2445         char *ptr = (char *)ptrv;
2446         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2447         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2448         int ret = 0;
2449
2450         WARN_ON(start > eb->len);
2451         WARN_ON(start + len > eb->start + eb->len);
2452
2453         offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
2454         if (i == 0)
2455                 offset += start_offset;
2456
2457         while(len > 0) {
2458                 page = extent_buffer_page(eb, i);
2459                 WARN_ON(!PageUptodate(page));
2460
2461                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2462
2463                 kaddr = kmap_atomic(page, KM_USER0);
2464                 ret = memcmp(ptr, kaddr + offset, cur);
2465                 kunmap_atomic(kaddr, KM_USER0);
2466                 if (ret)
2467                         break;
2468
2469                 ptr += cur;
2470                 len -= cur;
2471                 offset = 0;
2472                 i++;
2473         }
2474         return ret;
2475 }
2476 EXPORT_SYMBOL(memcmp_extent_buffer);
2477
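/*
 * copy 'len' bytes from srcv into the buffer at offset 'start', one page
 * at a time.
 */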
2478 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
2479                          unsigned long start, unsigned long len)
2480 {
2481         size_t cur;
2482         size_t offset;
2483         struct page *page;
2484         char *kaddr;
2485         char *src = (char *)srcv;
2486         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2487         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2488
2489         WARN_ON(start > eb->len);
2490         WARN_ON(start + len > eb->start + eb->len);
2491
2492         offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
2493         if (i == 0)
2494                 offset += start_offset;
2495
2496         while(len > 0) {
2497                 page = extent_buffer_page(eb, i);
2498                 WARN_ON(!PageUptodate(page));
2499
2500                 cur = min(len, PAGE_CACHE_SIZE - offset);
2501                 kaddr = kmap_atomic(page, KM_USER0);
2502                 memcpy(kaddr + offset, src, cur);
2503                 kunmap_atomic(kaddr, KM_USER0);
2504
2505                 src += cur;
2506                 len -= cur;
2507                 offset = 0;
2508                 i++;
2509         }
2510 }
2511 EXPORT_SYMBOL(write_extent_buffer);
2512
2513 void memset_extent_buffer(struct extent_buffer *eb, char c,
2514                           unsigned long start, unsigned long len)
2515 {
2516         size_t cur;
2517         size_t offset;
2518         struct page *page;
2519         char *kaddr;
2520         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2521         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2522
2523         WARN_ON(start > eb->len);
2524         WARN_ON(start + len > eb->start + eb->len);
2525
2526         offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
2527         if (i == 0)
2528                 offset += start_offset;
2529
2530         while(len > 0) {
2531                 page = extent_buffer_page(eb, i);
2532                 WARN_ON(!PageUptodate(page));
2533
2534                 cur = min(len, PAGE_CACHE_SIZE - offset);
2535                 kaddr = kmap_atomic(page, KM_USER0);
2536                 memset(kaddr + offset, c, cur);
2537                 kunmap_atomic(kaddr, KM_USER0);
2538
2539                 len -= cur;
2540                 offset = 0;
2541                 i++;
2542         }
2543 }
2544 EXPORT_SYMBOL(memset_extent_buffer);
2545
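/*
 * copy 'len' bytes from src at src_offset into dst at dst_offset.  The
 * two buffers are expected to be the same length.
 */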
2546 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
2547                         unsigned long dst_offset, unsigned long src_offset,
2548                         unsigned long len)
2549 {
2550         u64 dst_len = dst->len;
2551         size_t cur;
2552         size_t offset;
2553         struct page *page;
2554         char *kaddr;
2555         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2556         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
2557
2558         WARN_ON(src->len != dst_len);
2559
2560         offset = dst_offset & ((unsigned long)PAGE_CACHE_SIZE - 1);
2561         if (i == 0)
2562                 offset += start_offset;
2563
2564         while(len > 0) {
2565                 page = extent_buffer_page(dst, i);
2566                 WARN_ON(!PageUptodate(page));
2567
2568                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
2569
2570                 kaddr = kmap_atomic(page, KM_USER1);
2571                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
2572                 kunmap_atomic(kaddr, KM_USER1);
2573
2574                 src_offset += cur;
2575                 len -= cur;
2576                 offset = 0;
2577                 i++;
2578         }
2579 }
2580 EXPORT_SYMBOL(copy_extent_buffer);
2581
2582 static void move_pages(struct page *dst_page, struct page *src_page,
2583                        unsigned long dst_off, unsigned long src_off,
2584                        unsigned long len)
2585 {
2586         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
2587         if (dst_page == src_page) {
2588                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
2589         } else {
2590                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
2591                 char *p = dst_kaddr + dst_off + len;
2592                 char *s = src_kaddr + src_off + len;
2593
2594                 while (len--)
2595                         *--p = *--s;
2596
2597                 kunmap_atomic(src_kaddr, KM_USER1);
2598         }
2599         kunmap_atomic(dst_kaddr, KM_USER0);
2600 }
2601
2602 static void copy_pages(struct page *dst_page, struct page *src_page,
2603                        unsigned long dst_off, unsigned long src_off,
2604                        unsigned long len)
2605 {
2606         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
2607         char *src_kaddr;
2608
2609         if (dst_page != src_page)
2610                 src_kaddr = kmap_atomic(src_page, KM_USER1);
2611         else
2612                 src_kaddr = dst_kaddr;
2613
2614         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
2615         kunmap_atomic(dst_kaddr, KM_USER0);
2616         if (dst_page != src_page)
2617                 kunmap_atomic(src_kaddr, KM_USER1);
2618 }
2619
2620 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
2621                            unsigned long src_offset, unsigned long len)
2622 {
2623         size_t cur;
2624         size_t dst_off_in_page;
2625         size_t src_off_in_page;
2626         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2627         unsigned long dst_i;
2628         unsigned long src_i;
2629
2630         if (src_offset + len > dst->len) {
2631                 printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
2632                        src_offset, len, dst->len);
2633                 BUG_ON(1);
2634         }
2635         if (dst_offset + len > dst->len) {
2636                 printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
2637                        dst_offset, len, dst->len);
2638                 BUG_ON(1);
2639         }
2640
2641         while(len > 0) {
2642                 dst_off_in_page = dst_offset &
2643                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2644                 src_off_in_page = src_offset &
2645                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2646
2647                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
2648                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
2649
2650                 if (src_i == 0)
2651                         src_off_in_page += start_offset;
2652                 if (dst_i == 0)
2653                         dst_off_in_page += start_offset;
2654
2655                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
2656                                                src_off_in_page));
2657                 cur = min_t(unsigned long, cur,
2658                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
2659
2660                 copy_pages(extent_buffer_page(dst, dst_i),
2661                            extent_buffer_page(dst, src_i),
2662                            dst_off_in_page, src_off_in_page, cur);
2663
2664                 src_offset += cur;
2665                 dst_offset += cur;
2666                 len -= cur;
2667         }
2668 }
2669 EXPORT_SYMBOL(memcpy_extent_buffer);
2670
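/*
 * memmove within a single buffer.  When the destination is below the
 * source the forward memcpy_extent_buffer is used; otherwise the copy
 * runs backwards from the end of the range so overlapping ranges stay
 * correct.
 */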
2671 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
2672                            unsigned long src_offset, unsigned long len)
2673 {
2674         size_t cur;
2675         size_t dst_off_in_page;
2676         size_t src_off_in_page;
2677         unsigned long dst_end = dst_offset + len - 1;
2678         unsigned long src_end = src_offset + len - 1;
2679         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2680         unsigned long dst_i;
2681         unsigned long src_i;
2682
2683         if (src_offset + len > dst->len) {
2684                 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
2685                        src_offset, len, dst->len);
2686                 BUG_ON(1);
2687         }
2688         if (dst_offset + len > dst->len) {
2689                 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
2690                        dst_offset, len, dst->len);
2691                 BUG_ON(1);
2692         }
2693         if (dst_offset < src_offset) {
2694                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
2695                 return;
2696         }
2697         while(len > 0) {
2698                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
2699                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
2700
2701                 dst_off_in_page = dst_end &
2702                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2703                 src_off_in_page = src_end &
2704                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2705                 if (src_i == 0)
2706                         src_off_in_page += start_offset;
2707                 if (dst_i == 0)
2708                         dst_off_in_page += start_offset;
2709
2710                 cur = min_t(unsigned long, len, src_off_in_page + 1);
2711                 cur = min(cur, dst_off_in_page + 1);
2712                 move_pages(extent_buffer_page(dst, dst_i),
2713                            extent_buffer_page(dst, src_i),
2714                            dst_off_in_page - cur + 1,
2715                            src_off_in_page - cur + 1, cur);
2716
2717                 dst_end -= cur;
2718                 src_end -= cur;
2719                 len -= cur;
2720         }
2721 }
2722 EXPORT_SYMBOL(memmove_extent_buffer);