#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
+#include <linux/version.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include "extent_map.h"
/* temporary define until extent_map moves out of btrfs */
struct rb_node rb_node;
};
-void __init extent_map_init(void)
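+/*
+ * carried through the __extent_writepage callbacks so that a single
+ * bio can be built up across multiple pages before it is submitted
+ */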
+struct extent_page_data {
+ struct bio *bio;
+ struct extent_map_tree *tree;
+ get_extent_t *get_extent;
+};
+
+int __init extent_map_init(void)
{
extent_map_cache = btrfs_cache_create("extent_map",
sizeof(struct extent_map), 0,
NULL);
+ if (!extent_map_cache)
+ return -ENOMEM;
extent_state_cache = btrfs_cache_create("extent_state",
sizeof(struct extent_state), 0,
NULL);
+ if (!extent_state_cache)
+ goto free_map_cache;
extent_buffer_cache = btrfs_cache_create("extent_buffers",
sizeof(struct extent_buffer), 0,
NULL);
+ if (!extent_buffer_cache)
+ goto free_state_cache;
+ return 0;
+
+free_state_cache:
+ kmem_cache_destroy(extent_state_cache);
+free_map_cache:
+ kmem_cache_destroy(extent_map_cache);
+ return -ENOMEM;
}
-void __exit extent_map_exit(void)
+void extent_map_exit(void)
{
struct extent_state *state;
tree->map.rb_node = NULL;
tree->state.rb_node = NULL;
tree->ops = NULL;
+ tree->dirty_bytes = 0;
rwlock_init(&tree->lock);
spin_lock_init(&tree->lru_lock);
tree->mapping = mapping;
while(!list_empty(&tree->buffer_lru)) {
eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
lru);
- list_del(&eb->lru);
+ list_del_init(&eb->lru);
free_extent_buffer(eb);
}
}
if (prev && prev->end + 1 == em->start &&
((em->block_start == EXTENT_MAP_HOLE &&
prev->block_start == EXTENT_MAP_HOLE) ||
- (em->block_start == prev->block_end + 1))) {
+ (em->block_start == EXTENT_MAP_INLINE &&
+ prev->block_start == EXTENT_MAP_INLINE) ||
+ (em->block_start == EXTENT_MAP_DELALLOC &&
+ prev->block_start == EXTENT_MAP_DELALLOC) ||
+ (em->block_start < EXTENT_MAP_DELALLOC - 1 &&
+ em->block_start == prev->block_end + 1))) {
em->start = prev->start;
em->block_start = prev->block_start;
rb_erase(&prev->rb_node, &tree->map);
printk("end < start %Lu %Lu\n", end, start);
WARN_ON(1);
}
+ if (bits & EXTENT_DIRTY)
+ tree->dirty_bytes += end - start + 1;
state->state |= bits;
state->start = start;
state->end = end;
int delete)
{
int ret = state->state & bits;
+
+ if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ WARN_ON(range > tree->dirty_bytes);
+ tree->dirty_bytes -= range;
+ }
state->state &= ~bits;
if (wake)
wake_up(&state->wq);
}
EXPORT_SYMBOL(wait_extent_bit);
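+/*
+ * helper to set bits on an extent_state, updating the tree-wide
+ * dirty_bytes count when EXTENT_DIRTY is newly set
+ */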
+static void set_state_bits(struct extent_map_tree *tree,
+ struct extent_state *state,
+ int bits)
+{
+ if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ tree->dirty_bytes += range;
+ }
+ state->state |= bits;
+}
+
/*
* set some bits on a range in the tree. This may require allocations
* or sleeping, so the gfp mask is used to indicate what is allowed.
err = -EEXIST;
goto out;
}
- state->state |= bits;
+ set_state_bits(tree, state, bits);
start = state->end + 1;
merge_state(tree, state);
goto search_again;
if (err)
goto out;
if (state->end <= end) {
- state->state |= bits;
+ set_state_bits(tree, state, bits);
start = state->end + 1;
merge_state(tree, state);
} else {
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- prealloc->state |= bits;
+ set_state_bits(tree, prealloc, bits);
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
EXPORT_SYMBOL(find_first_extent_bit);
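+/*
+ * find and lock a contiguous run of delalloc bytes near *start, walking
+ * backwards to the beginning of the range.  *start and *end are updated
+ * to cover the range that was locked.
+ */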
u64 find_lock_delalloc_range(struct extent_map_tree *tree,
- u64 start, u64 lock_start, u64 *end, u64 max_bytes)
+ u64 *start, u64 *end, u64 max_bytes)
{
struct rb_node *node;
struct extent_state *state;
- u64 cur_start = start;
+ u64 cur_start = *start;
u64 found = 0;
u64 total_bytes = 0;
search_again:
node = tree_search(&tree->state, cur_start);
if (!node || IS_ERR(node)) {
+ *end = (u64)-1;
goto out;
}
while(1) {
state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != cur_start) {
+ if (found && state->start != cur_start) {
goto out;
}
if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
goto out;
}
- if (state->start >= lock_start) {
- if (state->state & EXTENT_LOCKED) {
- DEFINE_WAIT(wait);
- atomic_inc(&state->refs);
- write_unlock_irq(&tree->lock);
- schedule();
- write_lock_irq(&tree->lock);
- finish_wait(&state->wq, &wait);
- free_extent_state(state);
- goto search_again;
+ if (!found) {
+ struct extent_state *prev_state;
+ struct rb_node *prev_node = node;
+ while(1) {
+ prev_node = rb_prev(prev_node);
+ if (!prev_node)
+ break;
+ prev_state = rb_entry(prev_node,
+ struct extent_state,
+ rb_node);
+ if (!(prev_state->state & EXTENT_DELALLOC))
+ break;
+ state = prev_state;
+ node = prev_node;
}
- state->state |= EXTENT_LOCKED;
}
+ if (state->state & EXTENT_LOCKED) {
+ DEFINE_WAIT(wait);
+ atomic_inc(&state->refs);
+ prepare_to_wait(&state->wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ write_unlock_irq(&tree->lock);
+ schedule();
+ write_lock_irq(&tree->lock);
+ finish_wait(&state->wq, &wait);
+ free_extent_state(state);
+ goto search_again;
+ }
+ state->state |= EXTENT_LOCKED;
+ if (!found)
+ *start = state->start;
found++;
*end = state->end;
cur_start = state->end + 1;
node = rb_next(node);
if (!node)
break;
- total_bytes = state->end - state->start + 1;
+ total_bytes += state->end - state->start + 1;
if (total_bytes >= max_bytes)
break;
}
return found;
}
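+/*
+ * count the bytes in the tree that have a given set of bits set,
+ * starting at *start and stopping once max_bytes are counted or
+ * search_end is passed.  The common "all dirty bytes" query is
+ * answered from the tree-wide dirty_bytes counter.
+ */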
+u64 count_range_bits(struct extent_map_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ unsigned long bits)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ int found = 0;
+
+ if (search_end <= cur_start) {
+ printk("search_end %Lu start %Lu\n", search_end, cur_start);
+ WARN_ON(1);
+ return 0;
+ }
+
+ write_lock_irq(&tree->lock);
+ if (cur_start == 0 && bits == EXTENT_DIRTY) {
+ total_bytes = tree->dirty_bytes;
+ goto out;
+ }
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(&tree->state, cur_start);
+ if (!node || IS_ERR(node)) {
+ goto out;
+ }
+
+ while(1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start > search_end)
+ break;
+ if (state->end >= cur_start && (state->state & bits)) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = state->start;
+ found = 1;
+ }
+ }
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ write_unlock_irq(&tree->lock);
+ return total_bytes;
+}
/*
* helper function to lock both pages and extents in the tree.
* pages must be locked first.
node = tree_search(&tree->state, start);
while (node && start <= end) {
state = rb_entry(node, struct extent_state, rb_node);
- if (state->start > end)
- break;
if (filled && state->start > start) {
bitset = 0;
break;
}
+
+ if (state->start > end)
+ break;
+
if (state->state & bits) {
bitset = 1;
if (!filled)
static int check_page_uptodate(struct extent_map_tree *tree,
struct page *page)
{
- u64 start = page->index << PAGE_CACHE_SHIFT;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
SetPageUptodate(page);
static int check_page_locked(struct extent_map_tree *tree,
struct page *page)
{
- u64 start = page->index << PAGE_CACHE_SHIFT;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
unlock_page(page);
static int check_page_writeback(struct extent_map_tree *tree,
struct page *page)
{
- u64 start = page->index << PAGE_CACHE_SHIFT;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
end_page_writeback(page);
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_writepage(struct bio *bio, int err)
+#else
static int end_bio_extent_writepage(struct bio *bio,
unsigned int bytes_done, int err)
+#endif
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
u64 end;
int whole_page;
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
if (bio->bi_size)
return 1;
+#endif
do {
struct page *page = bvec->bv_page;
- start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
end = start + bvec->bv_len - 1;
if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
} while (bvec >= bio->bi_io_vec);
bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
return 0;
+#endif
}
/*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_readpage(struct bio *bio, int err)
+#else
static int end_bio_extent_readpage(struct bio *bio,
unsigned int bytes_done, int err)
+#endif
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
int whole_page;
int ret;
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
if (bio->bi_size)
return 1;
+#endif
do {
struct page *page = bvec->bv_page;
- start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
end = start + bvec->bv_len - 1;
if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
} while (bvec >= bio->bi_io_vec);
bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
return 0;
+#endif
}
/*
* the structs in the extent tree when done, and set the uptodate bits
* as appropriate.
*/
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_preparewrite(struct bio *bio, int err)
+#else
static int end_bio_extent_preparewrite(struct bio *bio,
unsigned int bytes_done, int err)
+#endif
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
u64 start;
u64 end;
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
if (bio->bi_size)
return 1;
+#endif
do {
struct page *page = bvec->bv_page;
- start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
end = start + bvec->bv_len - 1;
if (--bvec >= bio->bi_io_vec)
} while (bvec >= bio->bi_io_vec);
bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
return 0;
+#endif
}
-static int submit_extent_page(int rw, struct extent_map_tree *tree,
- struct page *page, sector_t sector,
- size_t size, unsigned long offset,
- struct block_device *bdev,
- bio_end_io_t end_io_func)
+static struct bio *
+extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+ gfp_t gfp_flags)
{
struct bio *bio;
- int ret = 0;
- bio = bio_alloc(GFP_NOIO, 1);
- bio->bi_sector = sector;
- bio->bi_bdev = bdev;
- bio->bi_io_vec[0].bv_page = page;
- bio->bi_io_vec[0].bv_len = size;
- bio->bi_io_vec[0].bv_offset = offset;
- bio->bi_vcnt = 1;
- bio->bi_idx = 0;
- bio->bi_size = size;
- bio->bi_end_io = end_io_func;
- bio->bi_private = tree;
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+ if (bio) {
+ bio->bi_bdev = bdev;
+ bio->bi_sector = first_sector;
+ }
+ return bio;
+}
+
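+/*
+ * grab a reference on the bio, sanity check the target sector against
+ * the device size, then submit.  Returns -EOPNOTSUPP if the device
+ * rejects the bio.
+ */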
+static int submit_one_bio(int rw, struct bio *bio)
+{
+ u64 maxsector;
+ int ret = 0;
bio_get(bio);
- submit_bio(rw, bio);
+ maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+ if (maxsector < bio->bi_sector) {
+ printk("sector too large max %Lu got %llu\n", maxsector,
+ (unsigned long long)bio->bi_sector);
+ WARN_ON(1);
+ }
+
+ submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
-
bio_put(bio);
return ret;
}
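+
+/*
+ * add a page to the bio being built up in *bio_ret.  If the page is not
+ * contiguous with the bio or does not fit, the old bio is submitted and
+ * a new one is started.
+ */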
+static int submit_extent_page(int rw, struct extent_map_tree *tree,
+ struct page *page, sector_t sector,
+ size_t size, unsigned long offset,
+ struct block_device *bdev,
+ struct bio **bio_ret,
+ unsigned long max_pages,
+ bio_end_io_t end_io_func)
+{
+ int ret = 0;
+ struct bio *bio;
+ int nr;
+
+ if (bio_ret && *bio_ret) {
+ bio = *bio_ret;
+ if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+ bio_add_page(bio, page, size, offset) < size) {
+ ret = submit_one_bio(rw, bio);
+ bio = NULL;
+ } else {
+ return 0;
+ }
+ }
+ nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
+ bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+ if (!bio) {
+ printk("failed to allocate bio nr %d\n", nr);
+ }
+ bio_add_page(bio, page, size, offset);
+ bio->bi_end_io = end_io_func;
+ bio->bi_private = tree;
+ if (bio_ret) {
+ *bio_ret = bio;
+ } else {
+ ret = submit_one_bio(rw, bio);
+ }
+
+ return ret;
+}
+
void set_page_extent_mapped(struct page *page)
{
if (!PagePrivate(page)) {
* into the tree that are removed when the IO is done (by the end_io
* handlers)
*/
-int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
- get_extent_t *get_extent)
+static int __extent_read_full_page(struct extent_map_tree *tree,
+ struct page *page,
+ get_extent_t *get_extent,
+ struct bio **bio)
{
struct inode *inode = page->mapping->host;
- u64 start = page->index << PAGE_CACHE_SHIFT;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
while (cur <= end) {
if (cur >= last_byte) {
+ char *userpage;
iosize = PAGE_CACHE_SIZE - page_offset;
- zero_user_page(page, page_offset, iosize, KM_USER0);
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + page_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
set_extent_uptodate(tree, cur, cur + iosize - 1,
GFP_NOFS);
unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- zero_user_page(page, page_offset, iosize, KM_USER0);
+ char *userpage;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + page_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+
set_extent_uptodate(tree, cur, cur + iosize - 1,
GFP_NOFS);
unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
cur + iosize - 1);
}
if (!ret) {
+ unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+ nr -= page->index;
ret = submit_extent_page(READ, tree, page,
- sector, iosize, page_offset,
- bdev, end_bio_extent_readpage);
+ sector, iosize, page_offset,
+ bdev, bio, nr,
+ end_bio_extent_readpage);
}
if (ret)
SetPageError(page);
}
return 0;
}
+
+int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
+ get_extent_t *get_extent)
+{
+ struct bio *bio = NULL;
+ int ret;
+
+ ret = __extent_read_full_page(tree, page, get_extent, &bio);
+ if (bio)
+ submit_one_bio(READ, bio);
+ return ret;
+}
EXPORT_SYMBOL(extent_read_full_page);
/*
* are found, they are marked writeback. Then the lock bits are removed
* and the end_io handler clears the writeback ranges
*/
-int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
- get_extent_t *get_extent,
- struct writeback_control *wbc)
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct inode *inode = page->mapping->host;
- u64 start = page->index << PAGE_CACHE_SHIFT;
+ struct extent_page_data *epd = data;
+ struct extent_map_tree *tree = epd->tree;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 delalloc_start;
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
u64 block_start;
+ u64 iosize;
sector_t sector;
struct extent_map *em;
struct block_device *bdev;
int ret;
int nr = 0;
size_t page_offset = 0;
- size_t iosize;
size_t blocksize;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
}
if (page->index == end_index) {
+ char *userpage;
+
size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
- zero_user_page(page, offset,
- PAGE_CACHE_SIZE - offset, KM_USER0);
+
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
}
set_page_extent_mapped(page);
- lock_extent(tree, start, page_end, GFP_NOFS);
- nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
- &delalloc_end,
- 128 * 1024 * 1024);
- if (nr_delalloc) {
- tree->ops->fill_delalloc(inode, start, delalloc_end);
- if (delalloc_end >= page_end + 1) {
- clear_extent_bit(tree, page_end + 1, delalloc_end,
- EXTENT_LOCKED | EXTENT_DELALLOC,
- 1, 0, GFP_NOFS);
- }
- clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
- 0, 0, GFP_NOFS);
- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
- printk("found delalloc bits after clear extent_bit\n");
- }
- } else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
- printk("found delalloc bits after find_delalloc_range returns 0\n");
+ delalloc_start = start;
+ delalloc_end = 0;
+ while(delalloc_end < page_end) {
+ nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+ &delalloc_end,
+ 128 * 1024 * 1024);
+ if (nr_delalloc == 0) {
+ delalloc_start = delalloc_end + 1;
+ continue;
+ }
+ tree->ops->fill_delalloc(inode, delalloc_start,
+ delalloc_end);
+ clear_extent_bit(tree, delalloc_start,
+ delalloc_end,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ 1, 0, GFP_NOFS);
+ delalloc_start = delalloc_end + 1;
 }
+ lock_extent(tree, start, page_end, GFP_NOFS);
end = page_end;
if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
break;
}
- em = get_extent(inode, page, page_offset, cur, end, 0);
+ em = epd->get_extent(inode, page, page_offset, cur, end, 1);
if (IS_ERR(em) || !em) {
SetPageError(page);
break;
if (ret)
SetPageError(page);
else {
+ unsigned long max_nr = end_index + 1;
set_range_writeback(tree, cur, cur + iosize - 1);
+ if (!PageWriteback(page)) {
+ printk("warning page %lu not writeback, "
+ "cur %llu end %llu\n", page->index,
+ (unsigned long long)cur,
+ (unsigned long long)end);
+ }
+
ret = submit_extent_page(WRITE, tree, page, sector,
iosize, page_offset, bdev,
+ &epd->bio, max_nr,
end_bio_extent_writepage);
if (ret)
SetPageError(page);
nr++;
}
done:
+ if (nr == 0) {
+ /* make sure the mapping tag for page dirty gets cleared */
+ set_page_writeback(page);
+ end_page_writeback(page);
+ }
unlock_extent(tree, start, page_end, GFP_NOFS);
unlock_page(page);
return 0;
}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+
+/* Taken directly from 2.6.23 for the 2.6.18 backport */
+typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
+ void *data);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space
+ * and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+static int write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, writepage_t writepage,
+ void *data)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int range_whole = 0;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ return 0;
+ }
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ scanned = 1;
+ }
+retry:
+ while (!done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or even
+ * swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+ lock_page(page);
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ unlock_page(page);
+ continue;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ret = (*writepage)(page, wbc, data);
+
+ if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+ unlock_page(page);
+ ret = 0;
+ }
+ if (ret || (--(wbc->nr_to_write) <= 0))
+ done = 1;
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = index;
+ return ret;
+}
+#endif
+
+int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc)
+{
+ int ret;
+ struct address_space *mapping = page->mapping;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ };
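+ /*
+ * after writing the page we were asked for, opportunistically push
+ * out up to 64 of the dirty pages that follow it
+ */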
+ struct writeback_control wbc_writepages = {
+ .bdi = wbc->bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .nr_to_write = 64,
+ .range_start = page_offset(page) + PAGE_CACHE_SIZE,
+ .range_end = (loff_t)-1,
+ };
+
+ ret = __extent_writepage(page, wbc, &epd);
+
+ write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
+ if (epd.bio) {
+ submit_one_bio(WRITE, epd.bio);
+ }
+ return ret;
+}
EXPORT_SYMBOL(extent_write_full_page);
+
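+/*
+ * run every dirty page in the mapping through __extent_writepage via
+ * write_cache_pages, sharing one extent_page_data so bios can span
+ * pages, then submit whatever bio is left over
+ */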
+int extent_writepages(struct extent_map_tree *tree,
+ struct address_space *mapping,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ };
+
+ ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
+ if (epd.bio) {
+ submit_one_bio(WRITE, epd.bio);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(extent_writepages);
+
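+/*
+ * readpages helper: add each page to the page cache and LRU by hand
+ * (add_to_page_cache_lru is not exported) and read it through
+ * __extent_read_full_page, batching the reads into a single bio
+ */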
+int extent_readpages(struct extent_map_tree *tree,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ get_extent_t get_extent)
+{
+ struct bio *bio = NULL;
+ unsigned page_idx;
+ struct pagevec pvec;
+
+ pagevec_init(&pvec, 0);
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ struct page *page = list_entry(pages->prev, struct page, lru);
+
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+ /*
+ * what we want to do here is call add_to_page_cache_lru,
+ * but that isn't exported, so we reproduce it here
+ */
+ if (!add_to_page_cache(page, mapping,
+ page->index, GFP_KERNEL)) {
+
+ /* open coding of lru_cache_add, also not exported */
+ page_cache_get(page);
+ if (!pagevec_add(&pvec, page))
+ __pagevec_lru_add(&pvec);
+ __extent_read_full_page(tree, page, get_extent, &bio);
+ }
+ page_cache_release(page);
+ }
+ if (pagevec_count(&pvec))
+ __pagevec_lru_add(&pvec);
+ BUG_ON(!list_empty(pages));
+ if (bio)
+ submit_one_bio(READ, bio);
+ return 0;
+}
+EXPORT_SYMBOL(extent_readpages);
+
/*
* basic invalidatepage code, this waits on any locked or writeback
* ranges corresponding to the page, and then deletes any extent state
int extent_invalidatepage(struct extent_map_tree *tree,
struct page *page, unsigned long offset)
{
- u64 start = (page->index << PAGE_CACHE_SHIFT);
+ u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
u64 end = start + PAGE_CACHE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
struct inode *inode, struct page *page,
unsigned from, unsigned to, get_extent_t *get_extent)
{
- u64 page_start = page->index << PAGE_CACHE_SHIFT;
+ u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
u64 block_start;
u64 orig_block_start;
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
}
- if (!isnew && !PageUptodate(page) &&
+ if ((em->block_start != EXTENT_MAP_HOLE &&
+ em->block_start != EXTENT_MAP_INLINE) &&
+ !isnew && !PageUptodate(page) &&
(block_off_end > to || block_off_start < from) &&
!test_range_bit(tree, block_start, cur_end,
EXTENT_UPTODATE, 1)) {
u64 extent_offset = block_start - em->start;
size_t iosize;
sector = (em->block_start + extent_offset) >> 9;
- iosize = (cur_end - block_start + blocksize - 1) &
+ iosize = (cur_end - block_start + blocksize) &
~((u64)blocksize - 1);
/*
* we've already got the extent locked, but we
EXTENT_LOCKED, 0, NULL, GFP_NOFS);
ret = submit_extent_page(READ, tree, page,
sector, iosize, page_offset, em->bdev,
+ NULL, 1,
end_bio_extent_preparewrite);
iocount++;
block_start = block_start + iosize;
int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
{
struct extent_map *em;
- u64 start = page->index << PAGE_CACHE_SHIFT;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
u64 orig_start = start;
int ret = 1;
struct inode *inode = mapping->host;
u64 start = iblock << inode->i_blkbits;
u64 end = start + (1 << inode->i_blkbits) - 1;
+ sector_t sector = 0;
struct extent_map *em;
em = get_extent(inode, NULL, 0, start, end, 0);
if (em->block_start == EXTENT_MAP_INLINE ||
em->block_start == EXTENT_MAP_HOLE)
- return 0;
+ goto out;
- return (em->block_start + start - em->start) >> inode->i_blkbits;
+ sector = (em->block_start + start - em->start) >> inode->i_blkbits;
+out:
+ free_extent_map(em);
+ return sector;
}
static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
rm = list_entry(tree->buffer_lru.prev,
struct extent_buffer, lru);
tree->lru_size--;
- list_del(&rm->lru);
+ list_del_init(&rm->lru);
free_extent_buffer(rm);
}
} else
unsigned long i)
{
struct page *p;
+ struct address_space *mapping;
if (i == 0)
return eb->first_page;
i += eb->start >> PAGE_CACHE_SHIFT;
- p = find_get_page(eb->first_page->mapping, i);
- page_cache_release(p);
+ mapping = eb->first_page->mapping;
+ read_lock_irq(&mapping->tree_lock);
+ p = radix_tree_lookup(&mapping->page_tree, i);
+ read_unlock_irq(&mapping->tree_lock);
return p;
}
spin_lock(&tree->lru_lock);
eb = find_lru(tree, start, len);
- if (eb) {
- goto lru_add;
- }
spin_unlock(&tree->lru_lock);
-
if (eb) {
- memset(eb, 0, sizeof(*eb));
- } else {
- eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ return eb;
}
+
+ eb = kmem_cache_zalloc(extent_buffer_cache, mask);
INIT_LIST_HEAD(&eb->lru);
eb->start = start;
eb->len = len;
atomic_set(&eb->refs, 1);
- spin_lock(&tree->lru_lock);
-lru_add:
- add_lru(tree, eb);
- spin_unlock(&tree->lru_lock);
return eb;
}
struct extent_buffer *eb;
struct page *p;
struct address_space *mapping = tree->mapping;
- int uptodate = 0;
+ int uptodate = 1;
eb = __alloc_extent_buffer(tree, start, len, mask);
if (!eb || IS_ERR(eb))
return NULL;
if (eb->flags & EXTENT_BUFFER_FILLED)
- return eb;
+ goto lru_add;
if (page0) {
eb->first_page = page0;
page_cache_get(page0);
mark_page_accessed(page0);
set_page_extent_mapped(page0);
+ WARN_ON(!PageUptodate(page0));
set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
len << 2);
} else {
p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
if (!p) {
WARN_ON(1);
- /* make sure the free only frees the pages we've
- * grabbed a reference on
- */
- eb->len = i << PAGE_CACHE_SHIFT;
- eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
goto fail;
}
set_page_extent_mapped(p);
if (uptodate)
eb->flags |= EXTENT_UPTODATE;
eb->flags |= EXTENT_BUFFER_FILLED;
+
+lru_add:
+ spin_lock(&tree->lru_lock);
+ add_lru(tree, eb);
+ spin_unlock(&tree->lru_lock);
return eb;
+
fail:
- free_extent_buffer(eb);
+ spin_lock(&tree->lru_lock);
+ list_del_init(&eb->lru);
+ spin_unlock(&tree->lru_lock);
+ if (!atomic_dec_and_test(&eb->refs))
+ return NULL;
+ for (index = 1; index < i; index++) {
+ page_cache_release(extent_buffer_page(eb, index));
+ }
+ if (i > 0)
+ page_cache_release(extent_buffer_page(eb, 0));
+ __free_extent_buffer(eb);
return NULL;
}
EXPORT_SYMBOL(alloc_extent_buffer);
gfp_t mask)
{
unsigned long num_pages = num_extent_pages(start, len);
- unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long i;
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
struct extent_buffer *eb;
struct page *p;
struct address_space *mapping = tree->mapping;
return NULL;
if (eb->flags & EXTENT_BUFFER_FILLED)
- return eb;
+ goto lru_add;
for (i = 0; i < num_pages; i++, index++) {
p = find_lock_page(mapping, index);
if (!p) {
- /* make sure the free only frees the pages we've
- * grabbed a reference on
- */
- eb->len = i << PAGE_CACHE_SHIFT;
- eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
goto fail;
}
set_page_extent_mapped(p);
if (uptodate)
eb->flags |= EXTENT_UPTODATE;
eb->flags |= EXTENT_BUFFER_FILLED;
+
+lru_add:
+ spin_lock(&tree->lru_lock);
+ add_lru(tree, eb);
+ spin_unlock(&tree->lru_lock);
return eb;
fail:
- free_extent_buffer(eb);
+ spin_lock(&tree->lru_lock);
+ list_del_init(&eb->lru);
+ spin_unlock(&tree->lru_lock);
+ if (!atomic_dec_and_test(&eb->refs))
+ return NULL;
+ for (index = 1; index < i; index++) {
+ page_cache_release(extent_buffer_page(eb, index));
+ }
+ if (i > 0)
+ page_cache_release(extent_buffer_page(eb, 0));
+ __free_extent_buffer(eb);
return NULL;
}
EXPORT_SYMBOL(find_extent_buffer);
if (!atomic_dec_and_test(&eb->refs))
return;
+ WARN_ON(!list_empty(&eb->lru));
num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++) {
+ for (i = 1; i < num_pages; i++) {
page_cache_release(extent_buffer_page(eb, i));
}
+ page_cache_release(extent_buffer_page(eb, 0));
__free_extent_buffer(eb);
}
EXPORT_SYMBOL(free_extent_buffer);
*/
if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
((i == num_pages - 1) &&
- ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
- start = page->index << PAGE_CACHE_SHIFT;
+ ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+ start = (u64)page->index << PAGE_CACHE_SHIFT;
end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end,
EXTENT_DIRTY, 0)) {
}
}
clear_page_dirty_for_io(page);
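+ /*
+ * the page is clean now, but the radix tree may still be
+ * tagged dirty; clear the tag so writeback does not keep
+ * revisiting this page
+ */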
+ write_lock_irq(&page->mapping->tree_lock);
+ if (!PageDirty(page)) {
+ radix_tree_tag_clear(&page->mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ }
+ write_unlock_irq(&page->mapping->tree_lock);
unlock_page(page);
}
return 0;
page = extent_buffer_page(eb, i);
if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
((i == num_pages - 1) &&
- ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
+ ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
check_page_uptodate(tree, page);
continue;
}
EXTENT_UPTODATE, 1)) {
return 0;
}
+
if (start) {
WARN_ON(start < eb->start);
start_i = (start >> PAGE_CACHE_SHIFT) -
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
- if (i == 0)
- offset += start_offset;
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
while(len > 0) {
page = extent_buffer_page(eb, i);
WARN_ON(!PageUptodate(page));
cur = min(len, (PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER0);
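+ /*
+ * map with KM_USER1: memcpy_extent_buffer holds a KM_USER0
+ * mapping while calling read_extent_buffer
+ */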
+ kaddr = kmap_atomic(page, KM_USER1);
memcpy(dst, kaddr + offset, cur);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr, KM_USER1);
dst += cur;
len -= cur;
struct page *p;
size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
- unsigned long end_i = (start_offset + start + min_len) >>
+ unsigned long end_i = (start_offset + start + min_len - 1) >>
PAGE_CACHE_SHIFT;
if (i != end_i)
*map_start = 0;
} else {
offset = 0;
- *map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
+ *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
}
- if (start + min_len >= eb->len) {
+ if (start + min_len > eb->len) {
printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
WARN_ON(1);
}
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
- if (i == 0)
- offset += start_offset;
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
while(len > 0) {
page = extent_buffer_page(eb, i);
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
- if (i == 0)
- offset += start_offset;
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
while(len > 0) {
page = extent_buffer_page(eb, i);
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
- kaddr = kmap_atomic(page, KM_USER0);
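+ /*
+ * KM_USER1 here as well, so a caller holding a KM_USER0
+ * mapping is not clobbered
+ */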
+ kaddr = kmap_atomic(page, KM_USER1);
memcpy(kaddr + offset, src, cur);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr, KM_USER1);
src += cur;
len -= cur;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
- if (i == 0)
- offset += start_offset;
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
while(len > 0) {
page = extent_buffer_page(eb, i);
WARN_ON(src->len != dst_len);
- offset = dst_offset & ((unsigned long)PAGE_CACHE_SIZE - 1);
- if (i == 0)
- offset += start_offset;
+ offset = (start_offset + dst_offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
while(len > 0) {
page = extent_buffer_page(dst, i);
cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
- kaddr = kmap_atomic(page, KM_USER1);
+ kaddr = kmap_atomic(page, KM_USER0);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
- kunmap_atomic(kaddr, KM_USER1);
+ kunmap_atomic(kaddr, KM_USER0);
src_offset += cur;
len -= cur;
}
while(len > 0) {
- dst_off_in_page = dst_offset &
+ dst_off_in_page = (start_offset + dst_offset) &
((unsigned long)PAGE_CACHE_SIZE - 1);
- src_off_in_page = src_offset &
+ src_off_in_page = (start_offset + src_offset) &
((unsigned long)PAGE_CACHE_SIZE - 1);
dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
- if (src_i == 0)
- src_off_in_page += start_offset;
- if (dst_i == 0)
- dst_off_in_page += start_offset;
-
cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
src_off_in_page));
cur = min_t(unsigned long, cur,
dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
- dst_off_in_page = dst_end &
+ dst_off_in_page = (start_offset + dst_end) &
((unsigned long)PAGE_CACHE_SIZE - 1);
- src_off_in_page = src_end &
+ src_off_in_page = (start_offset + src_end) &
((unsigned long)PAGE_CACHE_SIZE - 1);
- if (src_i == 0)
- src_off_in_page += start_offset;
- if (dst_i == 0)
- dst_off_in_page += start_offset;
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);