Btrfs: Fix an off by one in the extent_map prepare write code

[safe/jmp/linux-2.6] / fs / btrfs / extent_map.c
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c

index b6a4974..f53d8c9 100644 (file)
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -42,6 +42,7 @@ struct extent_page_data {
         struct extent_map_tree *tree;
         get_extent_t *get_extent;
  };
+
  int __init extent_map_init(void)
  {
         extent_map_cache = btrfs_cache_create("extent_map",
@@ -68,7 +69,7 @@ free_map_cache:
         return -ENOMEM;
  }
  
-void __exit extent_map_exit(void)
+void extent_map_exit(void)
  {
         struct extent_state *state;
  
@@ -94,6 +95,7 @@ void extent_map_tree_init(struct extent_map_tree *tree,
         tree->map.rb_node = NULL;
         tree->state.rb_node = NULL;
         tree->ops = NULL;
+       tree->dirty_bytes = 0;
         rwlock_init(&tree->lock);
         spin_lock_init(&tree->lru_lock);
         tree->mapping = mapping;
@@ -414,6 +416,8 @@ static int insert_state(struct extent_map_tree *tree,
                 printk("end < start %Lu %Lu\n", end, start);
                 WARN_ON(1);
         }
+       if (bits & EXTENT_DIRTY)
+               tree->dirty_bytes += end - start + 1;
         state->state |= bits;
         state->start = start;
         state->end = end;
@@ -476,6 +480,12 @@ static int clear_state_bit(struct extent_map_tree *tree,
                             int delete)
  {
         int ret = state->state & bits;
+
+       if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+               u64 range = state->end - state->start + 1;
+               WARN_ON(range > tree->dirty_bytes);
+               tree->dirty_bytes -= range;
+       }
         state->state &= ~bits;
         if (wake)
                 wake_up(&state->wq);
@@ -668,6 +678,17 @@ out:
  }
  EXPORT_SYMBOL(wait_extent_bit);
  
+/*
+ * OR bits into the state; a newly dirty state adds its length to dirty_bytes
+ */
+static void set_state_bits(struct extent_map_tree *tree,
+                          struct extent_state *state, int bits)
+{
+       if (!(state->state & EXTENT_DIRTY) && (bits & EXTENT_DIRTY))
+               tree->dirty_bytes += state->end - state->start + 1;
+       state->state |= bits;
+}
+
  /*
   * set some bits on a range in the tree.  This may require allocations
   * or sleeping, so the gfp mask is used to indicate what is allowed.
@@ -727,7 +748,7 @@ again:
                         err = -EEXIST;
                         goto out;
                 }
-               state->state |= bits;
+               set_state_bits(tree, state, bits);
                 start = state->end + 1;
                 merge_state(tree, state);
                 goto search_again;
@@ -762,7 +783,7 @@ again:
                 if (err)
                         goto out;
                 if (state->end <= end) {
-                       state->state |= bits;
+                       set_state_bits(tree, state, bits);
                         start = state->end + 1;
                         merge_state(tree, state);
                 } else {
@@ -808,7 +829,7 @@ again:
                 err = split_state(tree, state, prealloc, end + 1);
                 BUG_ON(err == -EEXIST);
  
-               prealloc->state |= bits;
+               set_state_bits(tree, prealloc, bits);
                 merge_state(tree, prealloc);
                 prealloc = NULL;
                 goto out;
@@ -1049,6 +1070,7 @@ u64 find_lock_delalloc_range(struct extent_map_tree *tree,
  search_again:
         node = tree_search(&tree->state, cur_start);
         if (!node || IS_ERR(node)) {
+               *end = (u64)-1;
                 goto out;
         }
  
@@ -1058,6 +1080,8 @@ search_again:
                         goto out;
                 }
                 if (!(state->state & EXTENT_DELALLOC)) {
+                       if (!found)
+                               *end = state->end;
                         goto out;
                 }
                 if (!found) {
@@ -1107,7 +1131,8 @@ out:
  }
  
  u64 count_range_bits(struct extent_map_tree *tree,
-                    u64 *start, u64 max_bytes, unsigned long bits)
+                    u64 *start, u64 search_end, u64 max_bytes,
+                    unsigned long bits)
  {
         struct rb_node *node;
         struct extent_state *state;
@@ -1115,7 +1140,17 @@ u64 count_range_bits(struct extent_map_tree *tree,
         u64 total_bytes = 0;
         int found = 0;
  
+       if (search_end <= cur_start) {
+               printk("search_end %Lu start %Lu\n", search_end, cur_start);
+               WARN_ON(1);
+               return 0;
+       }
+
         write_lock_irq(&tree->lock);
+       if (cur_start == 0 && bits == EXTENT_DIRTY) {
+               total_bytes = tree->dirty_bytes;
+               goto out;
+       }
         /*
          * this search will find all the extents that end after
          * our range starts.
@@ -1127,8 +1162,11 @@ u64 count_range_bits(struct extent_map_tree *tree,
  
         while(1) {
                 state = rb_entry(node, struct extent_state, rb_node);
-               if ((state->state & bits)) {
-                       total_bytes += state->end - state->start + 1;
+               if (state->start > search_end)
+                       break;
+               if (state->end >= cur_start && (state->state & bits)) {
+                       total_bytes += min(search_end, state->end) + 1 -
+                                      max(cur_start, state->start);
                         if (total_bytes >= max_bytes)
                                 break;
                         if (!found) {
@@ -1144,7 +1182,6 @@ out:
         write_unlock_irq(&tree->lock);
         return total_bytes;
  }
-
  /*
   * helper function to lock both pages and extents in the tree.
   * pages must be locked first.
@@ -1559,8 +1596,18 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
  
  static int submit_one_bio(int rw, struct bio *bio)
  {
+       u64 maxsector;
         int ret = 0;
+
         bio_get(bio);
+
+        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+       if (maxsector < bio->bi_sector) {
+               printk("sector too large max %Lu got %llu\n", maxsector,
+                       (unsigned long long)bio->bi_sector);
+               WARN_ON(1);
+       }
+
         submit_bio(rw, bio);
         if (bio_flagged(bio, BIO_EOPNOTSUPP))
                 ret = -EOPNOTSUPP;
@@ -1652,8 +1699,12 @@ static int __extent_read_full_page(struct extent_map_tree *tree,
  
         while (cur <= end) {
                 if (cur >= last_byte) {
+                       char *userpage;
                         iosize = PAGE_CACHE_SIZE - page_offset;
-                       zero_user_page(page, page_offset, iosize, KM_USER0);
+                       userpage = kmap_atomic(page, KM_USER0);
+                       memset(userpage + page_offset, 0, iosize);
+                       flush_dcache_page(page);
+                       kunmap_atomic(userpage, KM_USER0);
                         set_extent_uptodate(tree, cur, cur + iosize - 1,
                                             GFP_NOFS);
                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
@@ -1681,7 +1732,12 @@ static int __extent_read_full_page(struct extent_map_tree *tree,
  
                 /* we've found a hole, just zero and go on */
                 if (block_start == EXTENT_MAP_HOLE) {
-                       zero_user_page(page, page_offset, iosize, KM_USER0);
+                       char *userpage;
+                       userpage = kmap_atomic(page, KM_USER0);
+                       memset(userpage + page_offset, 0, iosize);
+                       flush_dcache_page(page);
+                       kunmap_atomic(userpage, KM_USER0);
+
                         set_extent_uptodate(tree, cur, cur + iosize - 1,
                                             GFP_NOFS);
                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
@@ -1778,9 +1834,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
         }
  
         if (page->index == end_index) {
+               char *userpage;
+
                 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
-               zero_user_page(page, offset,
-                              PAGE_CACHE_SIZE - offset, KM_USER0);
+
+               userpage = kmap_atomic(page, KM_USER0);
+               memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
+               flush_dcache_page(page);
+               kunmap_atomic(userpage, KM_USER0);
         }
  
         set_page_extent_mapped(page);
@@ -1791,8 +1852,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
                                                        &delalloc_end,
                                                        128 * 1024 * 1024);
-               if (nr_delalloc <= 0)
-                       break;
+               if (nr_delalloc == 0) {
+                       delalloc_start = delalloc_end + 1;
+                       continue;
+               }
                 tree->ops->fill_delalloc(inode, delalloc_start,
                                          delalloc_end);
                 clear_extent_bit(tree, delalloc_start,
@@ -1895,6 +1958,129 @@ done:
         return 0;
  }
  
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+
+/* Taken directly from 2.6.23 for 2.6.18 back port */
+typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
+                                void *data);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space
+ * and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * With wbc->range_cyclic set, the scan starts at mapping->writeback_index and,
+ * if the lookup from there finds no dirty pages at all, restarts once from
+ * page index 0.  Otherwise only the pages covering wbc->range_start through
+ * wbc->range_end are scanned.
+ *
+ * Each page returned by the dirty-tag lookup is locked and then skipped
+ * (unlocked without calling @writepage) if it no longer belongs to @mapping,
+ * if it is still under writeback, or if clear_page_dirty_for_io() returns 0
+ * for it.  A page past the end of an explicit range marks the scan as done.
+ * The scan is also marked done when @writepage returns non-zero, when
+ * wbc->nr_to_write drops to zero or below, or when wbc->nonblocking is set
+ * and the backing device reports write congestion.
+ *
+ * NOTE(review): "done" is only tested by the outer lookup loop, so once it is
+ * set the remaining pages of the current pagevec are still processed and a
+ * later @writepage call can overwrite an earlier non-zero return value.
+ *
+ * Returns 0 without looking at any page if wbc->nonblocking is set and the
+ * backing device is already write congested.  Otherwise returns the value of
+ * the last @writepage call made (AOP_WRITEPAGE_ACTIVATE is turned into 0),
+ * or 0 if @writepage was never called.
+ */
+static int write_cache_pages(struct address_space *mapping,
+                     struct writeback_control *wbc, writepage_t writepage,
+                     void *data)
+{
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
+       int ret = 0;
+       int done = 0;
+       struct pagevec pvec;
+       int nr_pages;
+       pgoff_t index;
+       pgoff_t end;            /* Inclusive */
+       int scanned = 0;
+       int range_whole = 0;
+
+       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+               wbc->encountered_congestion = 1;
+               return 0;
+       }
+
+       pagevec_init(&pvec, 0);
+       if (wbc->range_cyclic) {
+               index = mapping->writeback_index; /* Start from prev offset */
+               end = -1; /* NOTE(review): presumably "no limit"; assumes pgoff_t is unsigned */
+       } else {
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+               if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                       range_whole = 1;
+               scanned = 1; /* explicit range: suppresses the wrap to index 0 below */
+       }
+retry:
+       while (!done && (index <= end) &&
+              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                             PAGECACHE_TAG_DIRTY,
+                                             min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+               unsigned i;
+
+               scanned = 1;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       /*
+                        * At this point we hold neither mapping->tree_lock nor
+                        * lock on the page itself: the page may be truncated or
+                        * invalidated (changing page->mapping to NULL), or even
+                        * swizzled back from swapper_space to tmpfs file
+                        * mapping
+                        *
+                        * So take the page lock first and re-check
+                        * page->mapping below before touching the page.
+                        */
+                       lock_page(page);
+
+                       if (unlikely(page->mapping != mapping)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (!wbc->range_cyclic && page->index > end) {
+                               done = 1;
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (wbc->sync_mode != WB_SYNC_NONE)
+                               wait_on_page_writeback(page);
+
+                       if (PageWriteback(page) ||
+                           !clear_page_dirty_for_io(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       ret = (*writepage)(page, wbc, data);
+
+                       if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+                               unlock_page(page); /* the only unlock done here after writepage */
+                               ret = 0; /* not treated as an error */
+                       }
+                       if (ret || (--(wbc->nr_to_write) <= 0)) /* no decrement when ret != 0 */
+                               done = 1;
+                       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                               wbc->encountered_congestion = 1;
+                               done = 1;
+                       }
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+       if (!scanned && !done) {
+               /*
+                * We hit the last page and there is more work to be done: wrap
+                * back to the start of the file
+                *
+                * Only reachable in range_cyclic mode, because the explicit
+                * range path sets scanned before the loop; setting scanned
+                * here limits the wrap to a single retry.
+                */
+               scanned = 1;
+               index = 0;
+               goto retry;
+       }
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+               mapping->writeback_index = index; /* resume point for the next cyclic scan */
+       return ret;
+}
+#endif
+
  int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
                           get_extent_t *get_extent,
                           struct writeback_control *wbc)
@@ -1919,18 +2105,20 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
         ret = __extent_writepage(page, wbc, &epd);
  
         write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
-       if (epd.bio)
+       if (epd.bio) {
                 submit_one_bio(WRITE, epd.bio);
+       }
         return ret;
  }
  EXPORT_SYMBOL(extent_write_full_page);
  
+
  int extent_writepages(struct extent_map_tree *tree,
                       struct address_space *mapping,
                       get_extent_t *get_extent,
                       struct writeback_control *wbc)
  {
-       int ret;
+       int ret = 0;
         struct extent_page_data epd = {
                 .bio = NULL,
                 .tree = tree,
@@ -1938,8 +2126,9 @@ int extent_writepages(struct extent_map_tree *tree,
         };
  
         ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
-       if (epd.bio)
+       if (epd.bio) {
                 submit_one_bio(WRITE, epd.bio);
+       }
         return ret;
  }
  EXPORT_SYMBOL(extent_writepages);
@@ -2080,7 +2269,9 @@ int extent_prepare_write(struct extent_map_tree *tree,
                         flush_dcache_page(page);
                         kunmap_atomic(kaddr, KM_USER0);
                 }
-               if (!isnew && !PageUptodate(page) &&
+               if ((em->block_start != EXTENT_MAP_HOLE &&
+                    em->block_start != EXTENT_MAP_INLINE) &&
+                   !isnew && !PageUptodate(page) &&
                     (block_off_end > to || block_off_start < from) &&
                     !test_range_bit(tree, block_start, cur_end,
                                     EXTENT_UPTODATE, 1)) {
@@ -2088,7 +2279,7 @@ int extent_prepare_write(struct extent_map_tree *tree,
                         u64 extent_offset = block_start - em->start;
                         size_t iosize;
                         sector = (em->block_start + extent_offset) >> 9;
-                       iosize = (cur_end - block_start + blocksize - 1) &
+                       iosize = (cur_end - block_start + blocksize) &
                                 ~((u64)blocksize - 1);
                         /*
                          * we've already got the extent locked, but we