Btrfs: Create extent_buffer interface for large blocksizes
fs/btrfs/extent_map.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include "extent_map.h"
12
13 /* temporary define until extent_map moves out of btrfs */
14 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
15                                        unsigned long extra_flags,
16                                        void (*ctor)(void *, struct kmem_cache *,
17                                                     unsigned long));
18
19 static struct kmem_cache *extent_map_cache;
20 static struct kmem_cache *extent_state_cache;
21
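/*
 * generic header used by the rb-tree helpers below.  Both struct
 * extent_map and struct extent_state are expected to begin with this
 * same layout (start, end, in_tree, rb_node) so that tree_insert(),
 * __tree_search() and tree_delete() can work on either tree.
 */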
22 struct tree_entry {
23         u64 start;
24         u64 end;
25         int in_tree;
26         struct rb_node rb_node;
27 };
28
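/*
 * called at module load and unload to create and destroy the slab
 * caches used for extent_map and extent_state objects.
 */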
29 void __init extent_map_init(void)
30 {
31         extent_map_cache = btrfs_cache_create("extent_map",
32                                             sizeof(struct extent_map),
33                                             SLAB_DESTROY_BY_RCU,
34                                             NULL);
35         extent_state_cache = btrfs_cache_create("extent_state",
36                                             sizeof(struct extent_state),
37                                             SLAB_DESTROY_BY_RCU,
38                                             NULL);
39 }
40
41 void __exit extent_map_exit(void)
42 {
43         if (extent_map_cache)
44                 kmem_cache_destroy(extent_map_cache);
45         if (extent_state_cache)
46                 kmem_cache_destroy(extent_state_cache);
47 }
48
49 void extent_map_tree_init(struct extent_map_tree *tree,
50                           struct address_space *mapping, gfp_t mask)
51 {
52         tree->map.rb_node = NULL;
53         tree->state.rb_node = NULL;
54         tree->ops = NULL;
55         rwlock_init(&tree->lock);
56         tree->mapping = mapping;
57 }
58 EXPORT_SYMBOL(extent_map_tree_init);
59
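/*
 * allocate a new extent_map with a single reference held.  Returns
 * NULL on allocation failure; the IS_ERR() check below is defensive.
 */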
60 struct extent_map *alloc_extent_map(gfp_t mask)
61 {
62         struct extent_map *em;
63         em = kmem_cache_alloc(extent_map_cache, mask);
64         if (!em || IS_ERR(em))
65                 return em;
66         em->in_tree = 0;
67         atomic_set(&em->refs, 1);
68         return em;
69 }
70 EXPORT_SYMBOL(alloc_extent_map);
71
72 void free_extent_map(struct extent_map *em)
73 {
74         if (!em)
75                 return;
76         if (atomic_dec_and_test(&em->refs)) {
77                 WARN_ON(em->in_tree);
78                 kmem_cache_free(extent_map_cache, em);
79         }
80 }
81 EXPORT_SYMBOL(free_extent_map);
82
83
84 struct extent_state *alloc_extent_state(gfp_t mask)
85 {
86         struct extent_state *state;
87         state = kmem_cache_alloc(extent_state_cache, mask);
88         if (!state || IS_ERR(state))
89                 return state;
90         state->state = 0;
91         state->in_tree = 0;
92         state->private = 0;
93         atomic_set(&state->refs, 1);
94         init_waitqueue_head(&state->wq);
95         return state;
96 }
97 EXPORT_SYMBOL(alloc_extent_state);
98
99 void free_extent_state(struct extent_state *state)
100 {
101         if (!state)
102                 return;
103         if (atomic_dec_and_test(&state->refs)) {
104                 WARN_ON(state->in_tree);
105                 kmem_cache_free(extent_state_cache, state);
106         }
107 }
108 EXPORT_SYMBOL(free_extent_state);
109
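/*
 * insert 'node' into the tree keyed by its inclusive [start, end]
 * range.  If an existing entry already contains 'offset', nothing is
 * inserted and that entry's rb_node is returned so the caller can
 * report -EEXIST; NULL means the insert succeeded.
 */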
110 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
111                                    struct rb_node *node)
112 {
113         struct rb_node ** p = &root->rb_node;
114         struct rb_node * parent = NULL;
115         struct tree_entry *entry;
116
117         while(*p) {
118                 parent = *p;
119                 entry = rb_entry(parent, struct tree_entry, rb_node);
120
121                 if (offset < entry->start)
122                         p = &(*p)->rb_left;
123                 else if (offset > entry->end)
124                         p = &(*p)->rb_right;
125                 else
126                         return parent;
127         }
128
129         entry = rb_entry(node, struct tree_entry, rb_node);
130         entry->in_tree = 1;
131         rb_link_node(node, parent, p);
132         rb_insert_color(node, root);
133         return NULL;
134 }
135
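/*
 * find the entry whose [start, end] range contains 'offset'.  If no
 * entry contains it and prev_ret is non-NULL, *prev_ret is set to the
 * first entry that ends at or after 'offset' (or NULL past the end of
 * the tree), which is what the range walking code below relies on.
 */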
136 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
137                                    struct rb_node **prev_ret)
138 {
139         struct rb_node * n = root->rb_node;
140         struct rb_node *prev = NULL;
141         struct tree_entry *entry;
142         struct tree_entry *prev_entry = NULL;
143
144         while(n) {
145                 entry = rb_entry(n, struct tree_entry, rb_node);
146                 prev = n;
147                 prev_entry = entry;
148
149                 if (offset < entry->start)
150                         n = n->rb_left;
151                 else if (offset > entry->end)
152                         n = n->rb_right;
153                 else
154                         return n;
155         }
156         if (!prev_ret)
157                 return NULL;
158         while(prev && offset > prev_entry->end) {
159                 prev = rb_next(prev);
160                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
161         }
162         *prev_ret = prev;
163         return NULL;
164 }
165
166 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
167 {
168         struct rb_node *prev;
169         struct rb_node *ret;
170         ret = __tree_search(root, offset, &prev);
171         if (!ret)
172                 return prev;
173         return ret;
174 }
175
176 static int tree_delete(struct rb_root *root, u64 offset)
177 {
178         struct rb_node *node;
179         struct tree_entry *entry;
180
181         node = __tree_search(root, offset, NULL);
182         if (!node)
183                 return -ENOENT;
184         entry = rb_entry(node, struct tree_entry, rb_node);
185         entry->in_tree = 0;
186         rb_erase(node, root);
187         return 0;
188 }
189
190 /*
191  * add_extent_mapping tries a simple backward merge with existing
192  * mappings.  The extent_map struct passed in will be inserted into
193  * the tree directly (no copies made, just a reference taken).
194  */
195 int add_extent_mapping(struct extent_map_tree *tree,
196                        struct extent_map *em)
197 {
198         int ret = 0;
199         struct extent_map *prev = NULL;
200         struct rb_node *rb;
201
202         write_lock_irq(&tree->lock);
203         rb = tree_insert(&tree->map, em->end, &em->rb_node);
204         if (rb) {
205                 prev = rb_entry(rb, struct extent_map, rb_node);
206                 printk("found extent map %Lu %Lu on insert of %Lu %Lu\n",
                       prev->start, prev->end, em->start, em->end);
207                 ret = -EEXIST;
208                 goto out;
209         }
210         atomic_inc(&em->refs);
211         if (em->start != 0) {
212                 rb = rb_prev(&em->rb_node);
213                 if (rb)
214                         prev = rb_entry(rb, struct extent_map, rb_node);
215                 if (prev && prev->end + 1 == em->start &&
216                     ((em->block_start == EXTENT_MAP_HOLE &&
217                       prev->block_start == EXTENT_MAP_HOLE) ||
218                              (em->block_start == prev->block_end + 1))) {
219                         em->start = prev->start;
220                         em->block_start = prev->block_start;
221                         rb_erase(&prev->rb_node, &tree->map);
222                         prev->in_tree = 0;
223                         free_extent_map(prev);
224                 }
225         }
226 out:
227         write_unlock_irq(&tree->lock);
228         return ret;
229 }
230 EXPORT_SYMBOL(add_extent_mapping);
231
232 /*
233  * lookup_extent_mapping returns the first extent_map struct in the
234  * tree that intersects the [start, end] (inclusive) range.  There may
235  * be additional objects in the tree that intersect, so check the object
236  * returned carefully to make sure you don't need additional lookups.
237  */
238 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
239                                          u64 start, u64 end)
240 {
241         struct extent_map *em;
242         struct rb_node *rb_node;
243
244         read_lock_irq(&tree->lock);
245         rb_node = tree_search(&tree->map, start);
246         if (!rb_node) {
247                 em = NULL;
248                 goto out;
249         }
250         if (IS_ERR(rb_node)) {
251                 em = ERR_PTR(PTR_ERR(rb_node));
252                 goto out;
253         }
254         em = rb_entry(rb_node, struct extent_map, rb_node);
255         if (em->end < start || em->start > end) {
256                 em = NULL;
257                 goto out;
258         }
259         atomic_inc(&em->refs);
260 out:
261         read_unlock_irq(&tree->lock);
262         return em;
263 }
264 EXPORT_SYMBOL(lookup_extent_mapping);
265
266 /*
267  * removes an extent_map struct from the tree.  No reference counts are
268  * dropped, and no checks are done to see if the range is in use.
269  */
270 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
271 {
272         int ret;
273
274         write_lock_irq(&tree->lock);
275         ret = tree_delete(&tree->map, em->end);
276         write_unlock_irq(&tree->lock);
277         return ret;
278 }
279 EXPORT_SYMBOL(remove_extent_mapping);
280
281 /*
282  * utility function to look for merge candidates inside a given range.
283  * Any extents with matching state are merged together into a single
284  * extent in the tree.  Extents with EXTENT_IO in their state field
285  * are not merged because the end_io handlers need to be able to do
286  * operations on them without sleeping (or doing allocations/splits).
287  *
288  * This should be called with the tree lock held.
289  */
290 static int merge_state(struct extent_map_tree *tree,
291                        struct extent_state *state)
292 {
293         struct extent_state *other;
294         struct rb_node *other_node;
295
296         if (state->state & EXTENT_IOBITS)
297                 return 0;
298
299         other_node = rb_prev(&state->rb_node);
300         if (other_node) {
301                 other = rb_entry(other_node, struct extent_state, rb_node);
302                 if (other->end == state->start - 1 &&
303                     other->state == state->state) {
304                         state->start = other->start;
305                         other->in_tree = 0;
306                         rb_erase(&other->rb_node, &tree->state);
307                         free_extent_state(other);
308                 }
309         }
310         other_node = rb_next(&state->rb_node);
311         if (other_node) {
312                 other = rb_entry(other_node, struct extent_state, rb_node);
313                 if (other->start == state->end + 1 &&
314                     other->state == state->state) {
315                         other->start = state->start;
316                         state->in_tree = 0;
317                         rb_erase(&state->rb_node, &tree->state);
318                         free_extent_state(state);
319                 }
320         }
321         return 0;
322 }
323
324 /*
325  * insert an extent_state struct into the tree.  'bits' are set on the
326  * struct before it is inserted.
327  *
328  * This may return -EEXIST if the extent is already there, in which case the
329  * state struct is freed.
330  *
331  * The tree lock is not taken internally.  This is a utility function and
332  * probably isn't what you want to call (see set/clear_extent_bit).
333  */
334 static int insert_state(struct extent_map_tree *tree,
335                         struct extent_state *state, u64 start, u64 end,
336                         int bits)
337 {
338         struct rb_node *node;
339
340         if (end < start) {
341                 printk("end < start %Lu %Lu\n", end, start);
342                 WARN_ON(1);
343         }
344         state->state |= bits;
345         state->start = start;
346         state->end = end;
347         if ((end & 4095) == 0) {
348                 printk("insert state %Lu %Lu strange end\n", start, end);
349                 WARN_ON(1);
350         }
351         node = tree_insert(&tree->state, end, &state->rb_node);
352         if (node) {
353                 struct extent_state *found;
354                 found = rb_entry(node, struct extent_state, rb_node);
355                 printk("found node %Lu %Lu on insert of %Lu %Lu\n",
                       found->start, found->end, start, end);
356                 free_extent_state(state);
357                 return -EEXIST;
358         }
359         merge_state(tree, state);
360         return 0;
361 }
362
363 /*
364  * split a given extent state struct in two, inserting the preallocated
365  * struct 'prealloc' as the newly created second half.  'split' indicates an
366  * offset inside 'orig' where it should be split.
367  *
368  * Before calling,
369  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
370  * are two extent state structs in the tree:
371  * prealloc: [orig->start, split - 1]
372  * orig: [ split, orig->end ]
373  *
374  * The tree locks are not taken by this function. They need to be held
375  * by the caller.
376  */
377 static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
378                        struct extent_state *prealloc, u64 split)
379 {
380         struct rb_node *node;
381         prealloc->start = orig->start;
382         prealloc->end = split - 1;
383         prealloc->state = orig->state;
384         orig->start = split;
385         if ((prealloc->end & 4095) == 0) {
386                 printk("insert state %Lu %Lu strange end\n", prealloc->start,
387                        prealloc->end);
388                 WARN_ON(1);
389         }
390         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
391         if (node) {
392                 struct extent_state *found;
393                 found = rb_entry(node, struct extent_state, rb_node);
394                 printk("found node %Lu %Lu on insert of %Lu %Lu\n",
                       found->start, found->end, prealloc->start,
                       prealloc->end);
395                 free_extent_state(prealloc);
396                 return -EEXIST;
397         }
398         return 0;
399 }
400
401 /*
402  * utility function to clear some bits in an extent state struct.
403  * it will optionally wake up anyone waiting on this state (wake == 1), or
404  * forcibly remove the state from the tree (delete == 1).
405  *
406  * If no bits are set on the state struct after clearing things, the
407  * struct is freed and removed from the tree
408  */
409 static int clear_state_bit(struct extent_map_tree *tree,
410                             struct extent_state *state, int bits, int wake,
411                             int delete)
412 {
413         int ret = state->state & bits;
414         state->state &= ~bits;
415         if (wake)
416                 wake_up(&state->wq);
417         if (delete || state->state == 0) {
418                 if (state->in_tree) {
419                         rb_erase(&state->rb_node, &tree->state);
420                         state->in_tree = 0;
421                         free_extent_state(state);
422                 } else {
423                         WARN_ON(1);
424                 }
425         } else {
426                 merge_state(tree, state);
427         }
428         return ret;
429 }
430
431 /*
432  * clear some bits on a range in the tree.  This may require splitting
433  * or inserting elements in the tree, so the gfp mask is used to
434  * indicate which allocations or sleeping are allowed.
435  *
436  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
437  * the given range from the tree regardless of state (ie for truncate).
438  *
439  * the range [start, end] is inclusive.
440  *
441  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
442  * bits were already set, or zero if none of the bits were already set.
443  */
444 int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
445                      int bits, int wake, int delete, gfp_t mask)
446 {
447         struct extent_state *state;
448         struct extent_state *prealloc = NULL;
449         struct rb_node *node;
450         unsigned long flags;
451         int err;
452         int set = 0;
453
454 again:
455         if (!prealloc && (mask & __GFP_WAIT)) {
456                 prealloc = alloc_extent_state(mask);
457                 if (!prealloc)
458                         return -ENOMEM;
459         }
460
461         write_lock_irqsave(&tree->lock, flags);
462         /*
463          * this search will find the extents that end after
464          * our range starts
465          */
466         node = tree_search(&tree->state, start);
467         if (!node)
468                 goto out;
469         state = rb_entry(node, struct extent_state, rb_node);
470         if (state->start > end)
471                 goto out;
472         WARN_ON(state->end < start);
473
474         /*
475          *     | ---- desired range ---- |
476          *  | state | or
477          *  | ------------- state -------------- |
478          *
479          * We need to split the extent we found, and may flip
480          * bits on second half.
481          *
482          * If the extent we found extends past our range, we
483          * just split and search again.  It'll get split again
484          * the next time though.
485          *
486          * If the extent we found is inside our range, we clear
487          * the desired bit on it.
488          */
489
490         if (state->start < start) {
491                 err = split_state(tree, state, prealloc, start);
492                 BUG_ON(err == -EEXIST);
493                 prealloc = NULL;
494                 if (err)
495                         goto out;
496                 if (state->end <= end) {
497                         start = state->end + 1;
498                         set |= clear_state_bit(tree, state, bits,
499                                         wake, delete);
500                 } else {
501                         start = state->start;
502                 }
503                 goto search_again;
504         }
505         /*
506          * | ---- desired range ---- |
507          *                        | state |
508          * We need to split the extent, and clear the bit
509          * on the first half
510          */
511         if (state->start <= end && state->end > end) {
512                 err = split_state(tree, state, prealloc, end + 1);
513                 BUG_ON(err == -EEXIST);
514
515                 if (wake)
516                         wake_up(&state->wq);
517                 set |= clear_state_bit(tree, prealloc, bits,
518                                        wake, delete);
519                 prealloc = NULL;
520                 goto out;
521         }
522
523         start = state->end + 1;
524         set |= clear_state_bit(tree, state, bits, wake, delete);
525         goto search_again;
526
527 out:
528         write_unlock_irqrestore(&tree->lock, flags);
529         if (prealloc)
530                 free_extent_state(prealloc);
531
532         return set;
533
534 search_again:
535         if (start >= end)
536                 goto out;
537         write_unlock_irqrestore(&tree->lock, flags);
538         if (mask & __GFP_WAIT)
539                 cond_resched();
540         goto again;
541 }
542 EXPORT_SYMBOL(clear_extent_bit);
543
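/*
 * drop the tree lock and sleep until someone wakes state->wq
 * (typically clear_state_bit with wake == 1), then retake the lock.
 * The caller must hold a reference on 'state' across the sleep.
 */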
544 static int wait_on_state(struct extent_map_tree *tree,
545                          struct extent_state *state)
546 {
547         DEFINE_WAIT(wait);
548         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
549         read_unlock_irq(&tree->lock);
550         schedule();
551         read_lock_irq(&tree->lock);
552         finish_wait(&state->wq, &wait);
553         return 0;
554 }
555
556 /*
557  * waits for one or more bits to clear on a range in the state tree.
558  * The range [start, end] is inclusive.
559  * The tree lock is taken by this function
560  */
561 int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
562 {
563         struct extent_state *state;
564         struct rb_node *node;
565
566         read_lock_irq(&tree->lock);
567 again:
568         while (1) {
569                 /*
570                  * this search will find all the extents that end after
571                  * our range starts
572                  */
573                 node = tree_search(&tree->state, start);
574                 if (!node)
575                         break;
576
577                 state = rb_entry(node, struct extent_state, rb_node);
578
579                 if (state->start > end)
580                         goto out;
581
582                 if (state->state & bits) {
583                         start = state->start;
584                         atomic_inc(&state->refs);
585                         wait_on_state(tree, state);
586                         free_extent_state(state);
587                         goto again;
588                 }
589                 start = state->end + 1;
590
591                 if (start > end)
592                         break;
593
594                 if (need_resched()) {
595                         read_unlock_irq(&tree->lock);
596                         cond_resched();
597                         read_lock_irq(&tree->lock);
598                 }
599         }
600 out:
601         read_unlock_irq(&tree->lock);
602         return 0;
603 }
604 EXPORT_SYMBOL(wait_extent_bit);
605
606 /*
607  * set some bits on a range in the tree.  This may require allocations
608  * or sleeping, so the gfp mask is used to indicate what is allowed.
609  *
610  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
611  * range already has the desired bits set.  The start of the existing
612  * range is returned in failed_start in this case.
613  *
614  * [start, end] is inclusive
615  * This takes the tree lock.
616  */
617 int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
618                    int exclusive, u64 *failed_start, gfp_t mask)
619 {
620         struct extent_state *state;
621         struct extent_state *prealloc = NULL;
622         struct rb_node *node;
623         unsigned long flags;
624         int err = 0;
625         int set;
626         u64 last_start;
627         u64 last_end;
628 again:
629         if (!prealloc && (mask & __GFP_WAIT)) {
630                 prealloc = alloc_extent_state(mask);
631                 if (!prealloc)
632                         return -ENOMEM;
633         }
634
635         write_lock_irqsave(&tree->lock, flags);
636         /*
637          * this search will find all the extents that end after
638          * our range starts.
639          */
640         node = tree_search(&tree->state, start);
641         if (!node) {
642                 err = insert_state(tree, prealloc, start, end, bits);
643                 prealloc = NULL;
644                 BUG_ON(err == -EEXIST);
645                 goto out;
646         }
647
648         state = rb_entry(node, struct extent_state, rb_node);
649         last_start = state->start;
650         last_end = state->end;
651
652         /*
653          * | ---- desired range ---- |
654          * | state |
655          *
656          * Just lock what we found and keep going
657          */
658         if (state->start == start && state->end <= end) {
659                 set = state->state & bits;
660                 if (set && exclusive) {
661                         *failed_start = state->start;
662                         err = -EEXIST;
663                         goto out;
664                 }
665                 state->state |= bits;
666                 start = state->end + 1;
667                 merge_state(tree, state);
668                 goto search_again;
669         }
670
671         /*
672          *     | ---- desired range ---- |
673          * | state |
674          *   or
675          * | ------------- state -------------- |
676          *
677          * We need to split the extent we found, and may flip bits on
678          * second half.
679          *
680          * If the extent we found extends past our
681          * range, we just split and search again.  It'll get split
682          * again the next time though.
683          *
684          * If the extent we found is inside our range, we set the
685          * desired bit on it.
686          */
687         if (state->start < start) {
688                 set = state->state & bits;
689                 if (exclusive && set) {
690                         *failed_start = start;
691                         err = -EEXIST;
692                         goto out;
693                 }
694                 err = split_state(tree, state, prealloc, start);
695                 BUG_ON(err == -EEXIST);
696                 prealloc = NULL;
697                 if (err)
698                         goto out;
699                 if (state->end <= end) {
700                         state->state |= bits;
701                         start = state->end + 1;
702                         merge_state(tree, state);
703                 } else {
704                         start = state->start;
705                 }
706                 goto search_again;
707         }
708         /*
709          * | ---- desired range ---- |
710          *     | state | or               | state |
711          *
712          * There's a hole, we need to insert something in it and
713          * ignore the extent we found.
714          */
715         if (state->start > start) {
716                 u64 this_end;
717                 if (end < last_start)
718                         this_end = end;
719                 else
720                         this_end = last_start - 1;
721                 err = insert_state(tree, prealloc, start, this_end,
722                                    bits);
723                 prealloc = NULL;
724                 BUG_ON(err == -EEXIST);
725                 if (err)
726                         goto out;
727                 start = this_end + 1;
728                 goto search_again;
729         }
730         /*
731          * | ---- desired range ---- |
732          *                        | state |
733          * We need to split the extent, and set the bit
734          * on the first half
735          */
736         if (state->start <= end && state->end > end) {
737                 set = state->state & bits;
738                 if (exclusive && set) {
739                         *failed_start = start;
740                         err = -EEXIST;
741                         goto out;
742                 }
743                 err = split_state(tree, state, prealloc, end + 1);
744                 BUG_ON(err == -EEXIST);
745
746                 prealloc->state |= bits;
747                 merge_state(tree, prealloc);
748                 prealloc = NULL;
749                 goto out;
750         }
751
752         goto search_again;
753
754 out:
755         write_unlock_irqrestore(&tree->lock, flags);
756         if (prealloc)
757                 free_extent_state(prealloc);
758
759         return err;
760
761 search_again:
762         if (start > end)
763                 goto out;
764         write_unlock_irqrestore(&tree->lock, flags);
765         if (mask & __GFP_WAIT)
766                 cond_resched();
767         goto again;
768 }
769 EXPORT_SYMBOL(set_extent_bit);
770
771 /* wrappers around set/clear extent bit */
772 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
773                      gfp_t mask)
774 {
775         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
776                               mask);
777 }
778 EXPORT_SYMBOL(set_extent_dirty);
779
780 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
781                      gfp_t mask)
782 {
783         return set_extent_bit(tree, start, end,
784                               EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
785                               mask);
786 }
787 EXPORT_SYMBOL(set_extent_delalloc);
788
789 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
790                        gfp_t mask)
791 {
792         return clear_extent_bit(tree, start, end,
793                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
794 }
795 EXPORT_SYMBOL(clear_extent_dirty);
796
797 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
798                      gfp_t mask)
799 {
800         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
801                               mask);
802 }
803 EXPORT_SYMBOL(set_extent_new);
804
805 int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
806                        gfp_t mask)
807 {
808         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
809 }
810 EXPORT_SYMBOL(clear_extent_new);
811
812 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
813                         gfp_t mask)
814 {
815         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
816                               mask);
817 }
818 EXPORT_SYMBOL(set_extent_uptodate);
819
820 int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
821                           gfp_t mask)
822 {
823         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
824 }
825 EXPORT_SYMBOL(clear_extent_uptodate);
826
827 int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
828                          gfp_t mask)
829 {
830         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
831                               0, NULL, mask);
832 }
833 EXPORT_SYMBOL(set_extent_writeback);
834
835 int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
836                            gfp_t mask)
837 {
838         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
839 }
840 EXPORT_SYMBOL(clear_extent_writeback);
841
842 int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
843 {
844         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
845 }
846 EXPORT_SYMBOL(wait_on_extent_writeback);
847
848 /*
849  * locks a range in ascending order, waiting for any locked regions
850  * it hits on the way.  [start,end] are inclusive, and this will sleep.
851  */
852 int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
853 {
854         int err;
855         u64 failed_start;
856         while (1) {
857                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
858                                      &failed_start, mask);
859                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
860                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
861                         start = failed_start;
862                 } else {
863                         break;
864                 }
865                 WARN_ON(start > end);
866         }
867         return err;
868 }
869 EXPORT_SYMBOL(lock_extent);
870
871 int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
872                   gfp_t mask)
873 {
874         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
875 }
876 EXPORT_SYMBOL(unlock_extent);
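
/*
 * A minimal usage sketch (hypothetical caller, not taken from this
 * file): ranges are inclusive, so locking the extent state covering a
 * single page looks roughly like
 *
 *	u64 start = page->index << PAGE_CACHE_SHIFT;
 *	u64 end = start + PAGE_CACHE_SIZE - 1;
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... read or update the range ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 */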
877
878 /*
879  * helper function to set pages and extents in the tree dirty
880  */
881 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
882 {
883         unsigned long index = start >> PAGE_CACHE_SHIFT;
884         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
885         struct page *page;
886
887         while (index <= end_index) {
888                 page = find_get_page(tree->mapping, index);
889                 BUG_ON(!page);
890                 __set_page_dirty_nobuffers(page);
891                 page_cache_release(page);
892                 index++;
893         }
894         set_extent_dirty(tree, start, end, GFP_NOFS);
895         return 0;
896 }
897 EXPORT_SYMBOL(set_range_dirty);
898
899 /*
900  * helper function to set both pages and extents in the tree writeback
901  */
902 int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
903 {
904         unsigned long index = start >> PAGE_CACHE_SHIFT;
905         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
906         struct page *page;
907
908         while (index <= end_index) {
909                 page = find_get_page(tree->mapping, index);
910                 BUG_ON(!page);
911                 set_page_writeback(page);
912                 page_cache_release(page);
913                 index++;
914         }
915         set_extent_writeback(tree, start, end, GFP_NOFS);
916         return 0;
917 }
918 EXPORT_SYMBOL(set_range_writeback);
919
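/*
 * find the first extent at or after 'start' that has any of 'bits'
 * set.  Returns 0 and fills *start_ret/*end_ret on success, 1 if
 * nothing in the tree matched.
 */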
920 int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
921                           u64 *start_ret, u64 *end_ret, int bits)
922 {
923         struct rb_node *node;
924         struct extent_state *state;
925         int ret = 1;
926
927         write_lock_irq(&tree->lock);
928         /*
929          * this search will find all the extents that end after
930          * our range starts.
931          */
932         node = tree_search(&tree->state, start);
933         if (!node || IS_ERR(node)) {
934                 goto out;
935         }
936
937         while(1) {
938                 state = rb_entry(node, struct extent_state, rb_node);
939                 if (state->state & bits) {
940                         *start_ret = state->start;
941                         *end_ret = state->end;
942                         ret = 0;
                            break;
943                 }
944                 node = rb_next(node);
945                 if (!node)
946                         break;
947         }
948 out:
949         write_unlock_irq(&tree->lock);
950         return ret;
951 }
952 EXPORT_SYMBOL(find_first_extent_bit);
953
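/*
 * walk forward from 'start' over contiguous EXTENT_DELALLOC extents,
 * locking the ones that begin at or after 'lock_start' and waiting on
 * any that are already locked.  Returns the number of extents found,
 * sets *end to the end of the last one, and stops after roughly
 * max_bytes of delalloc has been seen.
 */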
954 u64 find_lock_delalloc_range(struct extent_map_tree *tree,
955                              u64 start, u64 lock_start, u64 *end, u64 max_bytes)
956 {
957         struct rb_node *node;
958         struct extent_state *state;
959         u64 cur_start = start;
960         u64 found = 0;
961         u64 total_bytes = 0;
962
963         write_lock_irq(&tree->lock);
964         /*
965          * this search will find all the extents that end after
966          * our range starts.
967          */
968 search_again:
969         node = tree_search(&tree->state, cur_start);
970         if (!node || IS_ERR(node)) {
971                 goto out;
972         }
973
974         while(1) {
975                 state = rb_entry(node, struct extent_state, rb_node);
976                 if (state->start != cur_start) {
977                         goto out;
978                 }
979                 if (!(state->state & EXTENT_DELALLOC)) {
980                         goto out;
981                 }
982                 if (state->start >= lock_start) {
983                         if (state->state & EXTENT_LOCKED) {
984                                 DEFINE_WAIT(wait);
985                                 atomic_inc(&state->refs);
                                    prepare_to_wait(&state->wq, &wait,
                                                    TASK_UNINTERRUPTIBLE);
986                                 write_unlock_irq(&tree->lock);
987                                 schedule();
988                                 write_lock_irq(&tree->lock);
989                                 finish_wait(&state->wq, &wait);
990                                 free_extent_state(state);
991                                 goto search_again;
992                         }
993                         state->state |= EXTENT_LOCKED;
994                 }
995                 found++;
996                 *end = state->end;
997                 cur_start = state->end + 1;
998                 node = rb_next(node);
999                 if (!node)
1000                         break;
1001                 total_bytes += state->end - state->start + 1;
1002                 if (total_bytes >= max_bytes)
1003                         break;
1004         }
1005 out:
1006         write_unlock_irq(&tree->lock);
1007         return found;
1008 }
1009
1010 /*
1011  * helper function to lock both pages and extents in the tree.
1012  * pages must be locked first.
1013  */
1014 int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
1015 {
1016         unsigned long index = start >> PAGE_CACHE_SHIFT;
1017         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1018         struct page *page;
1019         int err;
1020
1021         while (index <= end_index) {
1022                 page = grab_cache_page(tree->mapping, index);
1023                 if (!page) {
1024                         err = -ENOMEM;
1025                         goto failed;
1026                 }
1027                 if (IS_ERR(page)) {
1028                         err = PTR_ERR(page);
1029                         goto failed;
1030                 }
1031                 index++;
1032         }
1033         lock_extent(tree, start, end, GFP_NOFS);
1034         return 0;
1035
1036 failed:
1037         /*
1038          * we failed above in getting the page at 'index', so we undo here
1039          * up to but not including the page at 'index'
1040          */
1041         end_index = index;
1042         index = start >> PAGE_CACHE_SHIFT;
1043         while (index < end_index) {
1044                 page = find_get_page(tree->mapping, index);
1045                 unlock_page(page);
1046                 page_cache_release(page);
1047                 index++;
1048         }
1049         return err;
1050 }
1051 EXPORT_SYMBOL(lock_range);
1052
1053 /*
1054  * helper function to unlock both pages and extents in the tree.
1055  */
1056 int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
1057 {
1058         unsigned long index = start >> PAGE_CACHE_SHIFT;
1059         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1060         struct page *page;
1061
1062         while (index <= end_index) {
1063                 page = find_get_page(tree->mapping, index);
1064                 unlock_page(page);
1065                 page_cache_release(page);
1066                 index++;
1067         }
1068         unlock_extent(tree, start, end, GFP_NOFS);
1069         return 0;
1070 }
1071 EXPORT_SYMBOL(unlock_range);
1072
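/*
 * set_state_private/get_state_private stash and retrieve a caller
 * supplied u64 in the extent_state that starts exactly at 'start'.
 * -ENOENT is returned if no state begins at that offset.
 */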
1073 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
1074 {
1075         struct rb_node *node;
1076         struct extent_state *state;
1077         int ret = 0;
1078
1079         write_lock_irq(&tree->lock);
1080         /*
1081          * this search will find all the extents that end after
1082          * our range starts.
1083          */
1084         node = tree_search(&tree->state, start);
1085         if (!node || IS_ERR(node)) {
1086                 ret = -ENOENT;
1087                 goto out;
1088         }
1089         state = rb_entry(node, struct extent_state, rb_node);
1090         if (state->start != start) {
1091                 ret = -ENOENT;
1092                 goto out;
1093         }
1094         state->private = private;
1095 out:
1096         write_unlock_irq(&tree->lock);
1097         return ret;
1098
1099 }
1100
1101 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
1102 {
1103         struct rb_node *node;
1104         struct extent_state *state;
1105         int ret = 0;
1106
1107         read_lock_irq(&tree->lock);
1108         /*
1109          * this search will find all the extents that end after
1110          * our range starts.
1111          */
1112         node = tree_search(&tree->state, start);
1113         if (!node || IS_ERR(node)) {
1114                 ret = -ENOENT;
1115                 goto out;
1116         }
1117         state = rb_entry(node, struct extent_state, rb_node);
1118         if (state->start != start) {
1119                 ret = -ENOENT;
1120                 goto out;
1121         }
1122         *private = state->private;
1123 out:
1124         read_unlock_irq(&tree->lock);
1125         return ret;
1126 }
1127
1128 /*
1129  * searches a range in the state tree for a given mask.
1130  * If 'filled' == 1, this returns 1 only if every extent in the range
1131  * has the bits set.  Otherwise, 1 is returned if any bit in the
1132  * range is found set.
1133  */
1134 static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
1135                           int bits, int filled)
1136 {
1137         struct extent_state *state = NULL;
1138         struct rb_node *node;
1139         int bitset = 0;
1140
1141         read_lock_irq(&tree->lock);
1142         node = tree_search(&tree->state, start);
1143         while (node && start <= end) {
1144                 state = rb_entry(node, struct extent_state, rb_node);
1145                 if (state->start > end)
1146                         break;
1147
1148                 if (filled && state->start > start) {
1149                         bitset = 0;
1150                         break;
1151                 }
1152                 if (state->state & bits) {
1153                         bitset = 1;
1154                         if (!filled)
1155                                 break;
1156                 } else if (filled) {
1157                         bitset = 0;
1158                         break;
1159                 }
1160                 start = state->end + 1;
1161                 if (start > end)
1162                         break;
1163                 node = rb_next(node);
1164         }
1165         read_unlock_irq(&tree->lock);
1166         return bitset;
1167 }
1168
1169 /*
1170  * helper function to set a given page up to date if all the
1171  * extents in the tree for that page are up to date
1172  */
1173 static int check_page_uptodate(struct extent_map_tree *tree,
1174                                struct page *page)
1175 {
1176         u64 start = page->index << PAGE_CACHE_SHIFT;
1177         u64 end = start + PAGE_CACHE_SIZE - 1;
1178         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1179                 SetPageUptodate(page);
1180         return 0;
1181 }
1182
1183 /*
1184  * helper function to unlock a page if all the extents in the tree
1185  * for that page are unlocked
1186  */
1187 static int check_page_locked(struct extent_map_tree *tree,
1188                              struct page *page)
1189 {
1190         u64 start = page->index << PAGE_CACHE_SHIFT;
1191         u64 end = start + PAGE_CACHE_SIZE - 1;
1192         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1193                 unlock_page(page);
1194         return 0;
1195 }
1196
1197 /*
1198  * helper function to end page writeback if all the extents
1199  * in the tree for that page are done with writeback
1200  */
1201 static int check_page_writeback(struct extent_map_tree *tree,
1202                              struct page *page)
1203 {
1204         u64 start = page->index << PAGE_CACHE_SHIFT;
1205         u64 end = start + PAGE_CACHE_SIZE - 1;
1206         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1207                 end_page_writeback(page);
1208         return 0;
1209 }
1210
1211 /* lots and lots of room for performance fixes in the end_bio funcs */
1212
1213 /*
1214  * after a writepage IO is done, we need to:
1215  * clear the uptodate bits on error
1216  * clear the writeback bits in the extent tree for this IO
1217  * end_page_writeback if the page has no more pending IO
1218  *
1219  * Scheduling is not allowed, so the extent state tree is expected
1220  * to have one and only one object corresponding to this IO.
1221  */
1222 static int end_bio_extent_writepage(struct bio *bio,
1223                                    unsigned int bytes_done, int err)
1224 {
1225         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1226         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1227         struct extent_map_tree *tree = bio->bi_private;
1228         u64 start;
1229         u64 end;
1230         int whole_page;
1231
1232         if (bio->bi_size)
1233                 return 1;
1234
1235         do {
1236                 struct page *page = bvec->bv_page;
1237                 start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1238                 end = start + bvec->bv_len - 1;
1239
1240                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1241                         whole_page = 1;
1242                 else
1243                         whole_page = 0;
1244
1245                 if (--bvec >= bio->bi_io_vec)
1246                         prefetchw(&bvec->bv_page->flags);
1247
1248                 if (!uptodate) {
1249                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1250                         ClearPageUptodate(page);
1251                         SetPageError(page);
1252                 }
1253                 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1254
1255                 if (whole_page)
1256                         end_page_writeback(page);
1257                 else
1258                         check_page_writeback(tree, page);
1259                 if (tree->ops && tree->ops->writepage_end_io_hook)
1260                         tree->ops->writepage_end_io_hook(page, start, end);
1261         } while (bvec >= bio->bi_io_vec);
1262
1263         bio_put(bio);
1264         return 0;
1265 }
1266
1267 /*
1268  * after a readpage IO is done, we need to:
1269  * clear the uptodate bits on error
1270  * set the uptodate bits if things worked
1271  * set the page up to date if all extents in the tree are uptodate
1272  * clear the lock bit in the extent tree
1273  * unlock the page if there are no other extents locked for it
1274  *
1275  * Scheduling is not allowed, so the extent state tree is expected
1276  * to have one and only one object corresponding to this IO.
1277  */
1278 static int end_bio_extent_readpage(struct bio *bio,
1279                                    unsigned int bytes_done, int err)
1280 {
1281         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1282         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1283         struct extent_map_tree *tree = bio->bi_private;
1284         u64 start;
1285         u64 end;
1286         int whole_page;
1287         int ret;
1288
1289         if (bio->bi_size)
1290                 return 1;
1291
1292         do {
1293                 struct page *page = bvec->bv_page;
1294                 start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1295                 end = start + bvec->bv_len - 1;
1296
1297                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1298                         whole_page = 1;
1299                 else
1300                         whole_page = 0;
1301
1302                 if (--bvec >= bio->bi_io_vec)
1303                         prefetchw(&bvec->bv_page->flags);
1304
1305                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1306                         ret = tree->ops->readpage_end_io_hook(page, start, end);
1307                         if (ret)
1308                                 uptodate = 0;
1309                 }
1310                 if (uptodate) {
1311                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1312                         if (whole_page)
1313                                 SetPageUptodate(page);
1314                         else
1315                                 check_page_uptodate(tree, page);
1316                 } else {
1317                         ClearPageUptodate(page);
1318                         SetPageError(page);
1319                 }
1320
1321                 unlock_extent(tree, start, end, GFP_ATOMIC);
1322
1323                 if (whole_page)
1324                         unlock_page(page);
1325                 else
1326                         check_page_locked(tree, page);
1327         } while (bvec >= bio->bi_io_vec);
1328
1329         bio_put(bio);
1330         return 0;
1331 }
1332
1333 /*
1334  * IO done from prepare_write is pretty simple, we just unlock
1335  * the structs in the extent tree when done, and set the uptodate bits
1336  * as appropriate.
1337  */
1338 static int end_bio_extent_preparewrite(struct bio *bio,
1339                                        unsigned int bytes_done, int err)
1340 {
1341         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1342         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1343         struct extent_map_tree *tree = bio->bi_private;
1344         u64 start;
1345         u64 end;
1346
1347         if (bio->bi_size)
1348                 return 1;
1349
1350         do {
1351                 struct page *page = bvec->bv_page;
1352                 start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1353                 end = start + bvec->bv_len - 1;
1354
1355                 if (--bvec >= bio->bi_io_vec)
1356                         prefetchw(&bvec->bv_page->flags);
1357
1358                 if (uptodate) {
1359                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1360                 } else {
1361                         ClearPageUptodate(page);
1362                         SetPageError(page);
1363                 }
1364
1365                 unlock_extent(tree, start, end, GFP_ATOMIC);
1366
1367         } while (bvec >= bio->bi_io_vec);
1368
1369         bio_put(bio);
1370         return 0;
1371 }
1372
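/*
 * build and submit a single vector bio for one contiguous chunk of a
 * page.  The tree is stored in bi_private so the end_io handlers above
 * can find it, and -EOPNOTSUPP is returned if the device rejected the
 * bio.
 */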
1373 static int submit_extent_page(int rw, struct extent_map_tree *tree,
1374                               struct page *page, sector_t sector,
1375                               size_t size, unsigned long offset,
1376                               struct block_device *bdev,
1377                               bio_end_io_t end_io_func)
1378 {
1379         struct bio *bio;
1380         int ret = 0;
1381
1382         bio = bio_alloc(GFP_NOIO, 1);
1383
1384         bio->bi_sector = sector;
1385         bio->bi_bdev = bdev;
1386         bio->bi_io_vec[0].bv_page = page;
1387         bio->bi_io_vec[0].bv_len = size;
1388         bio->bi_io_vec[0].bv_offset = offset;
1389
1390         bio->bi_vcnt = 1;
1391         bio->bi_idx = 0;
1392         bio->bi_size = size;
1393
1394         bio->bi_end_io = end_io_func;
1395         bio->bi_private = tree;
1396
1397         bio_get(bio);
1398         submit_bio(rw, bio);
1399
1400         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1401                 ret = -EOPNOTSUPP;
1402
1403         bio_put(bio);
1404         return ret;
1405 }
1406
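/*
 * tag a page as being tracked by the extent tree.  PagePrivate is set
 * (with a dummy private value) and an extra page reference is taken so
 * the VM will call into the address_space invalidatepage/releasepage
 * hooks for it.
 */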
1407 void set_page_extent_mapped(struct page *page)
1408 {
1409         if (!PagePrivate(page)) {
1410                 SetPagePrivate(page);
1411                 WARN_ON(!page->mapping->a_ops->invalidatepage);
1412                 set_page_private(page, 1);
1413                 page_cache_get(page);
1414         }
1415 }
1416
1417 /*
1418  * basic readpage implementation.  Locked extent state structs are inserted
1419  * into the tree and removed when the IO is done (by the end_io
1420  * handlers)
1421  */
1422 int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
1423                           get_extent_t *get_extent)
1424 {
1425         struct inode *inode = page->mapping->host;
1426         u64 start = page->index << PAGE_CACHE_SHIFT;
1427         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1428         u64 end;
1429         u64 cur = start;
1430         u64 extent_offset;
1431         u64 last_byte = i_size_read(inode);
1432         u64 block_start;
1433         u64 cur_end;
1434         sector_t sector;
1435         struct extent_map *em;
1436         struct block_device *bdev;
1437         int ret;
1438         int nr = 0;
1439         size_t page_offset = 0;
1440         size_t iosize;
1441         size_t blocksize = inode->i_sb->s_blocksize;
1442
1443         set_page_extent_mapped(page);
1444
1445         end = page_end;
1446         lock_extent(tree, start, end, GFP_NOFS);
1447
1448         while (cur <= end) {
1449                 if (cur >= last_byte) {
1450                         iosize = PAGE_CACHE_SIZE - page_offset;
1451                         zero_user_page(page, page_offset, iosize, KM_USER0);
1452                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1453                                             GFP_NOFS);
1454                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1455                         break;
1456                 }
1457                 em = get_extent(inode, page, page_offset, cur, end, 0);
1458                 if (IS_ERR(em) || !em) {
1459                         SetPageError(page);
1460                         unlock_extent(tree, cur, end, GFP_NOFS);
1461                         break;
1462                 }
1463
1464                 extent_offset = cur - em->start;
1465                 BUG_ON(em->end < cur);
1466                 BUG_ON(end < cur);
1467
1468                 iosize = min(em->end - cur, end - cur) + 1;
1469                 cur_end = min(em->end, end);
1470                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1471                 sector = (em->block_start + extent_offset) >> 9;
1472                 bdev = em->bdev;
1473                 block_start = em->block_start;
1474                 free_extent_map(em);
1475                 em = NULL;
1476
1477                 /* we've found a hole, just zero and go on */
1478                 if (block_start == EXTENT_MAP_HOLE) {
1479                         zero_user_page(page, page_offset, iosize, KM_USER0);
1480                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1481                                             GFP_NOFS);
1482                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1483                         cur = cur + iosize;
1484                         page_offset += iosize;
1485                         continue;
1486                 }
1487                 /* the get_extent function already copied into the page */
1488                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1489                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1490                         cur = cur + iosize;
1491                         page_offset += iosize;
1492                         continue;
1493                 }
1494
1495                 ret = 0;
1496                 if (tree->ops && tree->ops->readpage_io_hook) {
1497                         ret = tree->ops->readpage_io_hook(page, cur,
1498                                                           cur + iosize - 1);
1499                 }
1500                 if (!ret) {
1501                         ret = submit_extent_page(READ, tree, page,
1502                                                  sector, iosize, page_offset,
1503                                                  bdev, end_bio_extent_readpage);
1504                 }
1505                 if (ret)
1506                         SetPageError(page);
1507                 cur = cur + iosize;
1508                 page_offset += iosize;
1509                 nr++;
1510         }
1511         if (!nr) {
1512                 if (!PageError(page))
1513                         SetPageUptodate(page);
1514                 unlock_page(page);
1515         }
1516         return 0;
1517 }
1518 EXPORT_SYMBOL(extent_read_full_page);
1519
1520 /*
1521  * the writepage semantics are similar to regular writepage.  extent
1522  * records are inserted to lock ranges in the tree, and as dirty areas
1523  * are found, they are marked writeback.  Then the lock bits are removed
1524  * and the end_io handler clears the writeback ranges
1525  */
1526 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
1527                           get_extent_t *get_extent,
1528                           struct writeback_control *wbc)
1529 {
1530         struct inode *inode = page->mapping->host;
1531         u64 start = page->index << PAGE_CACHE_SHIFT;
1532         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1533         u64 end;
1534         u64 cur = start;
1535         u64 extent_offset;
1536         u64 last_byte = i_size_read(inode);
1537         u64 block_start;
1538         sector_t sector;
1539         struct extent_map *em;
1540         struct block_device *bdev;
1541         int ret;
1542         int nr = 0;
1543         size_t page_offset = 0;
1544         size_t iosize;
1545         size_t blocksize;
1546         loff_t i_size = i_size_read(inode);
1547         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1548         u64 nr_delalloc;
1549         u64 delalloc_end;
1550
1551         WARN_ON(!PageLocked(page));
1552         if (page->index > end_index) {
1553                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1554                 unlock_page(page);
1555                 return 0;
1556         }
1557
1558         if (page->index == end_index) {
1559                 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1560                 zero_user_page(page, offset,
1561                                PAGE_CACHE_SIZE - offset, KM_USER0);
1562         }
1563
1564         set_page_extent_mapped(page);
1565
1566         lock_extent(tree, start, page_end, GFP_NOFS);
1567         nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
1568                                                &delalloc_end,
1569                                                128 * 1024 * 1024);
1570         if (nr_delalloc) {
1571                 tree->ops->fill_delalloc(inode, start, delalloc_end);
1572                 if (delalloc_end >= page_end + 1) {
1573                         clear_extent_bit(tree, page_end + 1, delalloc_end,
1574                                          EXTENT_LOCKED | EXTENT_DELALLOC,
1575                                          1, 0, GFP_NOFS);
1576                 }
1577                 clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
1578                                  0, 0, GFP_NOFS);
1579                 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1580                         printk("found delalloc bits after clear extent_bit\n");
1581                 }
1582         } else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1583                 printk("found delalloc bits after find_delalloc_range returns 0\n");
1584         }
1585
1586         end = page_end;
1587         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1588                 printk("found delalloc bits after lock_extent\n");
1589         }
1590
1591         if (last_byte <= start) {
1592                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1593                 goto done;
1594         }
1595
1596         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1597         blocksize = inode->i_sb->s_blocksize;
1598
1599         while (cur <= end) {
1600                 if (cur >= last_byte) {
1601                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1602                         break;
1603                 }
1604                 em = get_extent(inode, page, page_offset, cur, end, 0);
1605                 if (IS_ERR(em) || !em) {
1606                         SetPageError(page);
1607                         break;
1608                 }
1609
1610                 extent_offset = cur - em->start;
1611                 BUG_ON(em->end < cur);
1612                 BUG_ON(end < cur);
1613                 iosize = min(em->end - cur, end - cur) + 1;
1614                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1615                 sector = (em->block_start + extent_offset) >> 9;
1616                 bdev = em->bdev;
1617                 block_start = em->block_start;
1618                 free_extent_map(em);
1619                 em = NULL;
1620
1621                 if (block_start == EXTENT_MAP_HOLE ||
1622                     block_start == EXTENT_MAP_INLINE) {
1623                         clear_extent_dirty(tree, cur,
1624                                            cur + iosize - 1, GFP_NOFS);
1625                         cur = cur + iosize;
1626                         page_offset += iosize;
1627                         continue;
1628                 }
1629
1630                 /* leave this out until we have a page_mkwrite call */
1631                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1632                                    EXTENT_DIRTY, 0)) {
1633                         cur = cur + iosize;
1634                         page_offset += iosize;
1635                         continue;
1636                 }
1637                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1638                 if (tree->ops && tree->ops->writepage_io_hook) {
1639                         ret = tree->ops->writepage_io_hook(page, cur,
1640                                                 cur + iosize - 1);
1641                 } else {
1642                         ret = 0;
1643                 }
1644                 if (ret)
1645                         SetPageError(page);
1646                 else {
1647                         set_range_writeback(tree, cur, cur + iosize - 1);
1648                         ret = submit_extent_page(WRITE, tree, page, sector,
1649                                                  iosize, page_offset, bdev,
1650                                                  end_bio_extent_writepage);
1651                         if (ret)
1652                                 SetPageError(page);
1653                 }
1654                 cur = cur + iosize;
1655                 page_offset += iosize;
1656                 nr++;
1657         }
1658 done:
1659         unlock_extent(tree, start, page_end, GFP_NOFS);
1660         unlock_page(page);
1661         return 0;
1662 }
1663 EXPORT_SYMBOL(extent_write_full_page);
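/*
 * Example (not part of this file): a filesystem keeping an extent_map_tree
 * in its per-inode data could wire the helper above into ->writepage roughly
 * as below.  MYFS_I(), ->extent_tree and myfs_get_extent are hypothetical
 * names standing in for the caller's own helpers.
 *
 *	static int myfs_writepage(struct page *page, struct writeback_control *wbc)
 *	{
 *		struct extent_map_tree *tree;
 *
 *		tree = &MYFS_I(page->mapping->host)->extent_tree;
 *		return extent_write_full_page(tree, page, myfs_get_extent, wbc);
 *	}
 */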
1664
1665 /*
1666  * basic invalidatepage code: this waits on any locked or writeback
1667  * ranges corresponding to the page, and then deletes the extent state
1668  * records for that range from the tree.  (usage sketch below)
1669  */
1670 int extent_invalidatepage(struct extent_map_tree *tree,
1671                           struct page *page, unsigned long offset)
1672 {
1673         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
1674         u64 end = start + PAGE_CACHE_SIZE - 1;
1675         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
1676
1677         start += (offset + blocksize - 1) & ~(blocksize - 1);
1678         if (start > end)
1679                 return 0;
1680
1681         lock_extent(tree, start, end, GFP_NOFS);
1682         wait_on_extent_writeback(tree, start, end);
1683         clear_extent_bit(tree, start, end,
1684                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
1685                          1, 1, GFP_NOFS);
1686         return 0;
1687 }
1688 EXPORT_SYMBOL(extent_invalidatepage);
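/*
 * Example (not part of this file): a minimal ->invalidatepage wrapper,
 * again with a hypothetical MYFS_I() helper that holds the extent_map_tree.
 *
 *	static void myfs_invalidatepage(struct page *page, unsigned long offset)
 *	{
 *		struct extent_map_tree *tree;
 *
 *		tree = &MYFS_I(page->mapping->host)->extent_tree;
 *		extent_invalidatepage(tree, page, offset);
 *	}
 */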
1689
1690 /*
1691  * simple commit_write call: the page is marked dirty and i_size is updated
1692  * if the write extended the file.  (prepare_write/commit_write sketch below)
1693  */
1694 int extent_commit_write(struct extent_map_tree *tree,
1695                         struct inode *inode, struct page *page,
1696                         unsigned from, unsigned to)
1697 {
1698         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1699
1700         set_page_extent_mapped(page);
1701         set_page_dirty(page);
1702
1703         if (pos > inode->i_size) {
1704                 i_size_write(inode, pos);
1705                 mark_inode_dirty(inode);
1706         }
1707         return 0;
1708 }
1709 EXPORT_SYMBOL(extent_commit_write);
1710
1711 int extent_prepare_write(struct extent_map_tree *tree,
1712                          struct inode *inode, struct page *page,
1713                          unsigned from, unsigned to, get_extent_t *get_extent)
1714 {
1715         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
1716         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
1717         u64 block_start;
1718         u64 orig_block_start;
1719         u64 block_end;
1720         u64 cur_end;
1721         struct extent_map *em;
1722         unsigned blocksize = 1 << inode->i_blkbits;
1723         size_t page_offset = 0;
1724         size_t block_off_start;
1725         size_t block_off_end;
1726         int err = 0;
1727         int iocount = 0;
1728         int ret = 0;
1729         int isnew;
1730
1731         set_page_extent_mapped(page);
1732
1733         block_start = (page_start + from) & ~((u64)blocksize - 1);
1734         block_end = (page_start + to - 1) | (blocksize - 1);
1735         orig_block_start = block_start;
1736
1737         lock_extent(tree, page_start, page_end, GFP_NOFS);
1738         while(block_start <= block_end) {
1739                 em = get_extent(inode, page, page_offset, block_start,
1740                                 block_end, 1);
1741                 if (IS_ERR(em) || !em) {
1742                         err = -EIO; goto err;
1743                 }
1744                 cur_end = min(block_end, em->end);
1745                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
1746                 block_off_end = block_off_start + blocksize;
1747                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
1748
1749                 if (!PageUptodate(page) && isnew &&
1750                     (block_off_end > to || block_off_start < from)) {
1751                         void *kaddr;
1752
1753                         kaddr = kmap_atomic(page, KM_USER0);
1754                         if (block_off_end > to)
1755                                 memset(kaddr + to, 0, block_off_end - to);
1756                         if (block_off_start < from)
1757                                 memset(kaddr + block_off_start, 0,
1758                                        from - block_off_start);
1759                         flush_dcache_page(page);
1760                         kunmap_atomic(kaddr, KM_USER0);
1761                 }
1762                 if (!isnew && !PageUptodate(page) &&
1763                     (block_off_end > to || block_off_start < from) &&
1764                     !test_range_bit(tree, block_start, cur_end,
1765                                     EXTENT_UPTODATE, 1)) {
1766                         u64 sector;
1767                         u64 extent_offset = block_start - em->start;
1768                         size_t iosize;
1769                         sector = (em->block_start + extent_offset) >> 9;
1770                         iosize = (cur_end - block_start + blocksize - 1) &
1771                                 ~((u64)blocksize - 1);
1772                         /*
1773                          * we've already got the extent locked, but we
1774                          * need to split the state such that our end_bio
1775                          * handler can clear the lock.
1776                          */
1777                         set_extent_bit(tree, block_start,
1778                                        block_start + iosize - 1,
1779                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
1780                         ret = submit_extent_page(READ, tree, page,
1781                                          sector, iosize, page_offset, em->bdev,
1782                                          end_bio_extent_preparewrite);
1783                         iocount++;
1784                         block_start = block_start + iosize;
1785                 } else {
1786                         set_extent_uptodate(tree, block_start, cur_end,
1787                                             GFP_NOFS);
1788                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
1789                         block_start = cur_end + 1;
1790                 }
1791                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
1792                 free_extent_map(em);
1793         }
1794         if (iocount) {
1795                 wait_extent_bit(tree, orig_block_start,
1796                                 block_end, EXTENT_LOCKED);
1797         }
1798         check_page_uptodate(tree, page);
1799 err:
1800         /* FIXME, zero out newly allocated blocks on error */
1801         return err;
1802 }
1803 EXPORT_SYMBOL(extent_prepare_write);
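/*
 * Example (not part of this file): extent_prepare_write and
 * extent_commit_write are meant to back the ->prepare_write/->commit_write
 * pair of address_space_operations.  A sketch with hypothetical MYFS_I()
 * and myfs_get_extent helpers:
 *
 *	static int myfs_prepare_write(struct file *file, struct page *page,
 *				      unsigned from, unsigned to)
 *	{
 *		struct inode *inode = page->mapping->host;
 *		return extent_prepare_write(&MYFS_I(inode)->extent_tree, inode,
 *					    page, from, to, myfs_get_extent);
 *	}
 *
 *	static int myfs_commit_write(struct file *file, struct page *page,
 *				     unsigned from, unsigned to)
 *	{
 *		struct inode *inode = page->mapping->host;
 *		return extent_commit_write(&MYFS_I(inode)->extent_tree, inode,
 *					   page, from, to);
 *	}
 */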
1804
1805 /*
1806  * a helper for releasepage.  As long as there are no locked extents
1807  * in the range corresponding to the page, both state records and extent
1808  * map records are removed.  (usage sketch below)
1809  */
1810 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
1811 {
1812         struct extent_map *em;
1813         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1814         u64 end = start + PAGE_CACHE_SIZE - 1;
1815         u64 orig_start = start;
1816         int ret = 1;
1817
1818         while (start <= end) {
1819                 em = lookup_extent_mapping(tree, start, end);
1820                 if (!em || IS_ERR(em))
1821                         break;
1822                 if (!test_range_bit(tree, em->start, em->end,
1823                                     EXTENT_LOCKED, 0)) {
1824                         remove_extent_mapping(tree, em);
1825                         /* once for the rb tree */
1826                         free_extent_map(em);
1827                 }
1828                 start = em->end + 1;
1829                 /* once for us */
1830                 free_extent_map(em);
1831         }
1832         if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
1833                 ret = 0;
1834         else
1835                 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
1836                                  1, 1, GFP_NOFS);
1837         return ret;
1838 }
1839 EXPORT_SYMBOL(try_release_extent_mapping);
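/*
 * Example (not part of this file): a ->releasepage implementation can just
 * forward to the helper above; MYFS_I() is a hypothetical per-inode helper.
 *
 *	static int myfs_releasepage(struct page *page, gfp_t gfp_flags)
 *	{
 *		struct extent_map_tree *tree;
 *
 *		tree = &MYFS_I(page->mapping->host)->extent_tree;
 *		return try_release_extent_mapping(tree, page);
 *	}
 */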
1840
1841 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
1842                 get_extent_t *get_extent)
1843 {
1844         struct inode *inode = mapping->host;
1845         u64 start = (u64)iblock << inode->i_blkbits;
1846         u64 end = start + (1 << inode->i_blkbits) - 1;
1847         struct extent_map *em;
1848
1849         em = get_extent(inode, NULL, 0, start, end, 0);
1850         if (!em || IS_ERR(em))
1851                 return 0;
1852
1853         // XXX(hch): block 0 is valid in some cases, e.g. XFS RT device
1854         if (em->block_start == EXTENT_MAP_INLINE ||
1855             em->block_start == EXTENT_MAP_HOLE)
1856                 return 0;
1857
1858         return (em->block_start + start - em->start) >> inode->i_blkbits;
1859 }
1860
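/*
 * allocate an extent_buffer for the range [start, start + len) and pin the
 * pages backing it in tree->mapping, creating them if necessary.  If every
 * backing page is already uptodate the buffer is flagged EXTENT_UPTODATE.
 * Returns NULL on failure.
 */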
1861 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
1862                                           u64 start, unsigned long len,
1863                                           gfp_t mask)
1864 {
1865         unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) -
1866                                   (start >> PAGE_CACHE_SHIFT) + 1;
1867         unsigned long i;
1868         unsigned long index = start >> PAGE_CACHE_SHIFT;
1869         struct extent_buffer *eb;
1870         struct page *p;
1871         struct address_space *mapping = tree->mapping;
1872         int uptodate = 1;
1873
1874         eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask);
1875         if (!eb || IS_ERR(eb))
1876                 return NULL;
1877
1878         eb->start = start;
1879         eb->len = len;
1880         atomic_set(&eb->refs, 1);
1881
1882         for (i = 0; i < num_pages; i++, index++) {
1883                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
1884                 if (!p)
1885                         goto fail;
1886                 eb->pages[i] = p;
1887                 if (!PageUptodate(p))
1888                         uptodate = 0;
1889                 unlock_page(p);
1890         }
1891         if (uptodate)
1892                 eb->flags |= EXTENT_UPTODATE;
1893         return eb;
1894 fail:
1895         free_extent_buffer(eb);
1896         return NULL;
1897 }
1898 EXPORT_SYMBOL(alloc_extent_buffer);
1899
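/*
 * like alloc_extent_buffer, but only uses pages already present in the
 * page cache (find_get_page).  Returns NULL if any backing page is missing.
 */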
1900 struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
1901                                          u64 start, unsigned long len,
1902                                           gfp_t mask)
1903 {
1904         unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) -
1905                                   (start >> PAGE_CACHE_SHIFT) + 1;
1906         unsigned long i;
1907         unsigned long index = start >> PAGE_CACHE_SHIFT;
1908         struct extent_buffer *eb;
1909         struct page *p;
1910         struct address_space *mapping = tree->mapping;
1911
1912         eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask);
1913         if (!eb || IS_ERR(eb))
1914                 return NULL;
1915
1916         eb->start = start;
1917         eb->len = len;
1918         atomic_set(&eb->refs, 1);
1919
1920         for (i = 0; i < num_pages; i++, index++) {
1921                 p = find_get_page(mapping, index);
1922                 if (!p)
1923                         goto fail;
1924                 eb->pages[i] = p;
1925         }
1926         return eb;
1927 fail:
1928         free_extent_buffer(eb);
1929         return NULL;
1930 }
1931 EXPORT_SYMBOL(find_extent_buffer);
1932
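/*
 * drop a reference on an extent_buffer.  When the last reference is gone,
 * the backing page references are released and the buffer is freed.
 */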
1933 void free_extent_buffer(struct extent_buffer *eb)
1934 {
1935         unsigned long i;
1936         unsigned long num_pages;
1937
1938         if (!eb)
1939                 return;
1940
1941         if (!atomic_dec_and_test(&eb->refs))
1942                 return;
1943
1944         num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
1945                 (eb->start >> PAGE_CACHE_SHIFT) + 1;
1946
1947         for (i = 0; i < num_pages; i++) {
1948                 if (eb->pages[i])
1949                         page_cache_release(eb->pages[i]);
1950         }
1951         kfree(eb);
1952 }
1953 EXPORT_SYMBOL(free_extent_buffer);
1954
1955 int clear_extent_buffer_dirty(struct extent_map_tree *tree,
1956                               struct extent_buffer *eb)
1957 {
1958         int set;
1959         unsigned long i;
1960         unsigned long num_pages;
1961         struct page *page;
1962
1963         u64 start = eb->start;
1964         u64 end = start + eb->len - 1;
1965
1966         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
1967         num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
1968                 (eb->start >> PAGE_CACHE_SHIFT) + 1;
1969
1970         for (i = 0; i < num_pages; i++) {
1971                 page = eb->pages[i];
1972                 lock_page(page);
1973                 /*
1974                  * if we're on the first or the last page and the block
1975                  * isn't aligned on a page boundary, do extra checks to make
1976                  * sure we don't clean a page that is still partially dirty
1977                  */
1978                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
1979                     ((i == num_pages - 1) &&
1980                      ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
1981                         start = (u64)page->index << PAGE_CACHE_SHIFT;
1982                         end  = start + PAGE_CACHE_SIZE - 1;
1983                         if (test_range_bit(tree, start, end,
1984                                            EXTENT_DIRTY, 0)) {
1985                                 unlock_page(page);
1986                                 continue;
1987                         }
1988                 }
1989                 clear_page_dirty_for_io(page);
1990                 unlock_page(page);
1991         }
1992         return 0;
1993 }
1994 EXPORT_SYMBOL(clear_extent_buffer_dirty);
1995
1996 int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
1997                                     struct extent_buffer *eb)
1998 {
1999         return wait_on_extent_writeback(tree, eb->start,
2000                                         eb->start + eb->len - 1);
2001 }
2002 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2003
2004 int set_extent_buffer_dirty(struct extent_map_tree *tree,
2005                              struct extent_buffer *eb)
2006 {
2007         return set_range_dirty(tree, eb->start, eb->start + eb->len - 1);
2008 }
2009 EXPORT_SYMBOL(set_extent_buffer_dirty);
2010
2011 int set_extent_buffer_uptodate(struct extent_map_tree *tree,
2012                                 struct extent_buffer *eb)
2013 {
2014         unsigned long i;
2015         struct page *page;
2016         unsigned long num_pages;
2017
2018         num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
2019                 (eb->start >> PAGE_CACHE_SHIFT) + 1;
2020
2021         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2022                             GFP_NOFS);
2023         for (i = 0; i < num_pages; i++) {
2024                 page = eb->pages[i];
2025                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2026                     ((i == num_pages - 1) &&
2027                      ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
2028                         check_page_uptodate(tree, page);
2029                         continue;
2030                 }
2031                 SetPageUptodate(page);
2032         }
2033         return 0;
2034 }
2035 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2036
2037 int extent_buffer_uptodate(struct extent_map_tree *tree,
2038                              struct extent_buffer *eb)
2039 {
2040         if (eb->flags & EXTENT_UPTODATE)
2041                 return 1;
2042         return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2043                            EXTENT_UPTODATE, 1);
2044 }
2045 EXPORT_SYMBOL(extent_buffer_uptodate);
2046
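/*
 * read in any backing pages that aren't uptodate yet.  With wait == 0 reads
 * are only started on pages that can be trylocked and the call returns
 * immediately; with wait != 0 it blocks until the reads complete and returns
 * -EIO if any page is still not uptodate.
 */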
2047 int read_extent_buffer_pages(struct extent_map_tree *tree,
2048                              struct extent_buffer *eb, int wait)
2049 {
2050         unsigned long i;
2051         struct page *page;
2052         int err;
2053         int ret = 0;
2054         unsigned long num_pages;
2055
2056         if (eb->flags & EXTENT_UPTODATE)
2057                 return 0;
2058
2059         if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2060                            EXTENT_UPTODATE, 1)) {
2061                 return 0;
2062         }
2063
2064         num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
2065                 (eb->start >> PAGE_CACHE_SHIFT) + 1;
2066         for (i = 0; i < num_pages; i++) {
2067                 page = eb->pages[i];
2068                 if (PageUptodate(page)) {
2069                         continue;
2070                 }
2071                 if (!wait) {
2072                         if (TestSetPageLocked(page)) {
2073                                 continue;
2074                         }
2075                 } else {
2076                         lock_page(page);
2077                 }
2078                 if (!PageUptodate(page)) {
2079                         err = page->mapping->a_ops->readpage(NULL, page);
2080                         if (err) {
2081                                 ret = err;
2082                         }
2083                 } else {
2084                         unlock_page(page);
2085                 }
2086         }
2087
2088         if (ret || !wait) {
2089                 return ret;
2090         }
2091
2092         for (i = 0; i < num_pages; i++) {
2093                 page = eb->pages[i];
2094                 wait_on_page_locked(page);
2095                 if (!PageUptodate(page)) {
2096                         ret = -EIO;
2097                 }
2098         }
2099         eb->flags |= EXTENT_UPTODATE;
2100         return ret;
2101 }
2102 EXPORT_SYMBOL(read_extent_buffer_pages);
2103
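/*
 * copy len bytes starting at offset 'start' inside the extent buffer into
 * dstv, walking the backing pages as needed.
 */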
2104 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2105                         unsigned long start,
2106                         unsigned long len)
2107 {
2108         size_t cur;
2109         size_t offset;
2110         struct page *page;
2111         char *kaddr;
2112         char *dst = (char *)dstv;
2113         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2114         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2115
2116         WARN_ON(start > eb->len);
2117         WARN_ON(start + len > eb->start + eb->len);
2118
2119         page = eb->pages[i];
2120         offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
2121         if (i == 0)
2122                 offset += start_offset;
2123
2124         while(len > 0) {
2125                 WARN_ON(!PageUptodate(page));
2126
2127                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2128                 // kaddr = kmap_atomic(page, KM_USER0);
2129                 kaddr = page_address(page);
2130                 memcpy(dst, kaddr + offset, cur);
2131                 // kunmap_atomic(kaddr, KM_USER0);
2132
2133                 dst += cur;
2134                 len -= cur;
2135                 offset = 0;
2136                 if (len)
2137                         page = eb->pages[++i];
2138         }
2139 }
2140 EXPORT_SYMBOL(read_extent_buffer);
2141
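/*
 * map the page holding byte 'start' of the buffer: *map points at the first
 * buffer byte stored in that page, *map_start is that byte's offset inside
 * the buffer and *map_len is the number of contiguous bytes available.
 * *token is handed back to unmap_extent_buffer() when the caller is done.
 */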
2142 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
2143                       char **token, char **map,
2144                       unsigned long *map_start,
2145                       unsigned long *map_len, int km)
2146 {
2147         size_t offset;
2148         char *kaddr;
2149         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2150         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2151
2152         WARN_ON(start > eb->len);
2153
2154         if (i == 0) {
2155                 offset = start_offset;
2156                 *map_start = 0;
2157         } else {
2158                 offset = 0;
2159                 *map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
2160         }
2161
2162         // kaddr = kmap_atomic(eb->pages[i], km);
2163         kaddr = page_address(eb->pages[i]);
2164         *token = kaddr;
2165         *map = kaddr + offset;
2166         *map_len = PAGE_CACHE_SIZE - offset;
2167         return 0;
2168 }
2169 EXPORT_SYMBOL(map_extent_buffer);
2170
2171 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
2172 {
2173         // kunmap_atomic(token, km);
2174 }
2175 EXPORT_SYMBOL(unmap_extent_buffer);
2176
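/*
 * memcmp the range [start, start + len) of the extent buffer against ptrv,
 * returning the usual negative/zero/positive result.
 */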
2177 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
2178                           unsigned long start,
2179                           unsigned long len)
2180 {
2181         size_t cur;
2182         size_t offset;
2183         struct page *page;
2184         char *kaddr;
2185         char *ptr = (char *)ptrv;
2186         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2187         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2188         int ret = 0;
2189
2190         WARN_ON(start > eb->len);
2191         WARN_ON(start + len > eb->start + eb->len);
2192
2193         page = eb->pages[i];
2194         offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
2195         if (i == 0)
2196                 offset += start_offset;
2197
2198         while(len > 0) {
2199                 WARN_ON(!PageUptodate(page));
2200
2201                 cur = min(len, (PAGE_CACHE_SIZE - offset));
2202
2203                 // kaddr = kmap_atomic(page, KM_USER0);
2204                 kaddr = page_address(page);
2205                 ret = memcmp(ptr, kaddr + offset, cur);
2206                 // kunmap_atomic(kaddr, KM_USER0);
2207                 if (ret)
2208                         break;
2209
2210                 ptr += cur;
2211                 len -= cur;
2212                 offset = 0;
2213                 if (len)
2214                         page = eb->pages[++i];
2215         }
2216         return ret;
2217 }
2218 EXPORT_SYMBOL(memcmp_extent_buffer);
2219
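/*
 * copy len bytes from srcv into the extent buffer at offset 'start'.
 */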
2220 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
2221                          unsigned long start, unsigned long len)
2222 {
2223         size_t cur;
2224         size_t offset;
2225         struct page *page;
2226         char *kaddr;
2227         char *src = (char *)srcv;
2228         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2229         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2230
2231         WARN_ON(start > eb->len);
2232         WARN_ON(start + len > eb->start + eb->len);
2233
2234         page = eb->pages[i];
2235         offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
2236         if (i == 0)
2237                 offset += start_offset;
2238
2239         while(len > 0) {
2240                 WARN_ON(!PageUptodate(page));
2241
2242                 cur = min(len, PAGE_CACHE_SIZE - offset);
2243                 // kaddr = kmap_atomic(page, KM_USER0);
2244                 kaddr = page_address(page);
2245                 memcpy(kaddr + offset, src, cur);
2246                 // kunmap_atomic(kaddr, KM_USER0);
2247
2248                 src += cur;
2249                 len -= cur;
2250                 offset = 0;
2251                 if (len)
2252                         page = eb->pages[++i];
2253         }
2254 }
2255 EXPORT_SYMBOL(write_extent_buffer);
2256
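/*
 * fill len bytes of the extent buffer, starting at offset 'start', with the
 * byte value c.
 */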
2257 void memset_extent_buffer(struct extent_buffer *eb, char c,
2258                           unsigned long start, unsigned long len)
2259 {
2260         size_t cur;
2261         size_t offset;
2262         struct page *page;
2263         char *kaddr;
2264         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2265         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2266
2267         WARN_ON(start > eb->len);
2268         WARN_ON(start + len > eb->start + eb->len);
2269
2270         page = eb->pages[i];
2271         offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
2272         if (i == 0)
2273                 offset += start_offset;
2274
2275         while(len > 0) {
2276                 WARN_ON(!PageUptodate(page));
2277
2278                 cur = min(len, PAGE_CACHE_SIZE - offset);
2279                 // kaddr = kmap_atomic(page, KM_USER0);
2280                 kaddr = page_address(page);
2281                 memset(kaddr + offset, c, cur);
2282                 // kunmap_atomic(kaddr, KM_USER0);
2283
2284                 len -= cur;
2285                 offset = 0;
2286                 if (len)
2287                         page = eb->pages[++i];
2288         }
2289 }
2290 EXPORT_SYMBOL(memset_extent_buffer);
2291
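/*
 * copy len bytes at src_offset in the src buffer into the dst buffer at
 * dst_offset.  The two buffers are expected to be the same length.
 */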
2292 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
2293                         unsigned long dst_offset, unsigned long src_offset,
2294                         unsigned long len)
2295 {
2296         u64 dst_len = dst->len;
2297         size_t cur;
2298         size_t offset;
2299         struct page *page;
2300         char *kaddr;
2301         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2302         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
2303
2304         WARN_ON(src->len != dst_len);
2305
2306         offset = dst_offset & ((unsigned long)PAGE_CACHE_SIZE - 1);
2307         if (i == 0)
2308                 offset += start_offset;
2309
2310         while(len > 0) {
2311                 page = dst->pages[i];
2312                 WARN_ON(!PageUptodate(page));
2313
2314                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
2315
2316                 // kaddr = kmap_atomic(page, KM_USER1);
2317                 kaddr = page_address(page);
2318                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
2319                 // kunmap_atomic(kaddr, KM_USER1);
2320
2321                 src_offset += cur;
2322                 len -= cur;
2323                 offset = 0;
2324                 i++;
2325         }
2326 }
2327 EXPORT_SYMBOL(copy_extent_buffer);
2328
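/*
 * memmove-style page helper for memmove_extent_buffer(): the bytes are
 * copied back to front, so overlapping source and destination ranges inside
 * a single page are handled correctly.
 */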
2329 static void move_pages(struct page *dst_page, struct page *src_page,
2330                        unsigned long dst_off, unsigned long src_off,
2331                        unsigned long len)
2332 {
2333         // char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
2334         char *dst_kaddr = page_address(dst_page);
2335         if (dst_page == src_page) {
2336                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
2337         } else {
2338                 // char *src_kaddr = kmap_atomic(src_page, KM_USER1);
2339                 char *src_kaddr = page_address(src_page);
2340                 char *p = dst_kaddr + dst_off + len;
2341                 char *s = src_kaddr + src_off + len;
2342
2343                 while (len--)
2344                         *--p = *--s;
2345
2346                 // kunmap_atomic(src_kaddr, KM_USER1);
2347         }
2348         // kunmap_atomic(dst_kaddr, KM_USER0);
2349 }
2350
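/*
 * memcpy-style page helper for memcpy_extent_buffer(): a plain forward copy,
 * so callers must avoid overlapping ranges that would need a backwards copy.
 */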
2351 static void copy_pages(struct page *dst_page, struct page *src_page,
2352                        unsigned long dst_off, unsigned long src_off,
2353                        unsigned long len)
2354 {
2355         //kmap_atomic(dst_page, KM_USER0);
2356         char *dst_kaddr = page_address(dst_page);
2357         char *src_kaddr;
2358
2359         if (dst_page != src_page)
2360                 src_kaddr = page_address(src_page); // kmap_atomic(src_page, KM_USER1);
2361         else
2362                 src_kaddr = dst_kaddr;
2363
2364         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
2365         /*
2366         kunmap_atomic(dst_kaddr, KM_USER0);
2367         if (dst_page != src_page)
2368                 kunmap_atomic(src_kaddr, KM_USER1);
2369         */
2370 }
2371
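/*
 * copy a range inside a single extent buffer from src_offset to dst_offset,
 * walking forward page by page.  Use memmove_extent_buffer() when the two
 * ranges may overlap.
 */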
2372 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
2373                            unsigned long src_offset, unsigned long len)
2374 {
2375         size_t cur;
2376         size_t dst_off_in_page;
2377         size_t src_off_in_page;
2378         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2379         unsigned long dst_i;
2380         unsigned long src_i;
2381
2382         if (src_offset + len > dst->len) {
2383                 printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
2384                        src_offset, len, dst->len);
2385                 BUG_ON(1);
2386         }
2387         if (dst_offset + len > dst->len) {
2388                 printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
2389                        dst_offset, len, dst->len);
2390                 BUG_ON(1);
2391         }
2392
2393         while(len > 0) {
2394                 dst_off_in_page = dst_offset &
2395                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2396                 src_off_in_page = src_offset &
2397                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2398
2399                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
2400                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
2401
2402                 if (src_i == 0)
2403                         src_off_in_page += start_offset;
2404                 if (dst_i == 0)
2405                         dst_off_in_page += start_offset;
2406
2407                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
2408                                                src_off_in_page));
2409                 cur = min(cur, (unsigned long)(PAGE_CACHE_SIZE -
2410                                                dst_off_in_page));
2411
2412                 copy_pages(dst->pages[dst_i], dst->pages[src_i],
2413                            dst_off_in_page, src_off_in_page, cur);
2414
2415                 src_offset += cur;
2416                 dst_offset += cur;
2417                 len -= cur;
2418         }
2419 }
2420 EXPORT_SYMBOL(memcpy_extent_buffer);
2421
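/*
 * memmove inside a single extent buffer.  When the destination starts after
 * the source, the copy runs back to front so overlapping ranges stay safe;
 * otherwise it falls back to memcpy_extent_buffer().
 */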
2422 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
2423                            unsigned long src_offset, unsigned long len)
2424 {
2425         size_t cur;
2426         size_t dst_off_in_page;
2427         size_t src_off_in_page;
2428         unsigned long dst_end = dst_offset + len - 1;
2429         unsigned long src_end = src_offset + len - 1;
2430         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
2431         unsigned long dst_i;
2432         unsigned long src_i;
2433
2434         if (src_offset + len > dst->len) {
2435                 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
2436                        src_offset, len, dst->len);
2437                 BUG_ON(1);
2438         }
2439         if (dst_offset + len > dst->len) {
2440                 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
2441                        dst_offset, len, dst->len);
2442                 BUG_ON(1);
2443         }
2444         if (dst_offset < src_offset) {
2445                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
2446                 return;
2447         }
2448         while(len > 0) {
2449                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
2450                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
2451
2452                 dst_off_in_page = dst_end &
2453                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2454                 src_off_in_page = src_end &
2455                         ((unsigned long)PAGE_CACHE_SIZE - 1);
2456
2457                 if (src_i == 0)
2458                         src_off_in_page += start_offset;
2459                 if (dst_i == 0)
2460                         dst_off_in_page += start_offset;
2461
2462                 cur = min(len, src_off_in_page + 1);
2463                 cur = min(cur, dst_off_in_page + 1);
2464 // printk("move pages orig dst %lu src %lu len %lu, this %lu %lu %lu\n", dst_offset, src_offset, len, dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur);
2465                 move_pages(dst->pages[dst_i], dst->pages[src_i],
2466                            dst_off_in_page - cur + 1,
2467                            src_off_in_page - cur + 1, cur);
2468
2469                 dst_end -= cur - 1;
2470                 src_end -= cur - 1;
2471                 len -= cur;
2472         }
2473 }
2474 EXPORT_SYMBOL(memmove_extent_buffer);