X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fbtrfs%2Fordered-data.c;h=77c2411a5f0f0c59b6ab97e78b2828d94a1c7401;hb=46939f8b15e44f065d052e89ea4f2adc81fdc740;hp=6513270f054c538832e93c7e22dd12e20ee0e589;hpb=e6dcd2dc9c489108648e2ed543315dd134d50a9a;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6513270..77c2411 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,12 +19,13 @@ #include #include #include +#include +#include #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" #include "extent_io.h" - static u64 entry_end(struct btrfs_ordered_extent *entry) { if (entry->file_offset + entry->len < entry->file_offset) @@ -32,14 +33,17 @@ static u64 entry_end(struct btrfs_ordered_extent *entry) return entry->file_offset + entry->len; } +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, struct rb_node *node) { - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; struct btrfs_ordered_extent *entry; - while(*p) { + while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); @@ -56,16 +60,20 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, struct rb_node **prev_ret) { - struct rb_node * n = root->rb_node; + struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; struct rb_node *test; struct btrfs_ordered_extent *entry; struct btrfs_ordered_extent *prev_entry = NULL; - while(n) { + while (n) { entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); prev = n; prev_entry = entry; @@ -80,7 +88,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, if (!prev_ret) return NULL; - while(prev && file_offset >= entry_end(prev_entry)) { + while (prev && file_offset >= entry_end(prev_entry)) { test = rb_next(prev); if (!test) break; @@ -94,7 +102,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, if (prev) prev_entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); - while(prev && file_offset < entry_end(prev_entry)) { + while (prev && file_offset < entry_end(prev_entry)) { test = rb_prev(prev); if (!test) break; @@ -106,6 +114,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, return NULL; } +/* + * helper to check if a given offset is inside a given entry + */ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) { if (file_offset < entry->file_offset || @@ -114,6 +125,10 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) return 1; } +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, u64 file_offset) { @@ -136,8 +151,21 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } +/* allocate and add a new ordered_extent into the per-inode tree. + * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * This also sets the EXTENT_ORDERED bit on the range in the inode. + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len) + u64 start, u64 len, u64 disk_len, int type) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -152,59 +180,61 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; + entry->disk_len = disk_len; entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + /* one ref for the tree */ atomic_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); - if (node) { - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - atomic_inc(&entry->refs); - } + BUG_ON(node); + set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, entry_end(entry) - 1, GFP_NOFS); - set_bit(BTRFS_ORDERED_START, &entry->flags); + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + mutex_unlock(&tree->mutex); BUG_ON(node); return 0; } -int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum) +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) { struct btrfs_ordered_inode_tree *tree; - struct rb_node *node; - struct btrfs_ordered_extent *entry; tree = &BTRFS_I(inode)->ordered_tree; mutex_lock(&tree->mutex); - node = tree_search(tree, sum->file_offset); - if (!node) { -search_fail: -printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset); - node = rb_first(&tree->tree); - while(node) { - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start); - node = rb_next(node); - } - BUG(); - } - BUG_ON(!node); - - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, sum->file_offset)) { - goto search_fail; - } - list_add_tail(&sum->list, &entry->list); mutex_unlock(&tree->mutex); return 0; } +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ int btrfs_dec_test_ordered_pending(struct inode *inode, u64 file_offset, u64 io_size) { @@ -233,9 +263,6 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, ret = test_range_bit(io_tree, entry->file_offset, entry->file_offset + entry->len - 1, EXTENT_ORDERED, 0); - if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) { -printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry)); - } if (ret == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); out: @@ -243,13 +270,31 @@ out: return ret == 0; } +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { - if (atomic_dec_and_test(&entry->refs)) + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + if (atomic_dec_and_test(&entry->refs)) { + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } kfree(entry); + } return 0; } +/* + * remove an ordered extent from the tree. No references are dropped + * but, anyone waiting on this extent is woken up. + */ int btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry) { @@ -262,62 +307,134 @@ int btrfs_remove_ordered_extent(struct inode *inode, rb_erase(node, &tree->tree); tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + mutex_unlock(&tree->mutex); wake_up(&entry->wait); return 0; } -void btrfs_wait_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) { - u64 start = entry->file_offset; - u64 end = start + entry->len - 1; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) - do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE); -#else - do_sync_mapping_range(inode->i_mapping, start, end, - SYNC_FILE_RANGE_WRITE); -#endif - wait_event(entry->wait, - test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); + struct list_head splice; + struct list_head *cur; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + cur = splice.next; + ordered = list_entry(cur, struct btrfs_ordered_extent, + root_extent_list); + if (nocow_only && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + list_move(&ordered->root_extent_list, + &root->fs_info->ordered_extents); + cond_resched_lock(&root->fs_info->ordered_extent_lock); + continue; + } + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } + + spin_lock(&root->fs_info->ordered_extent_lock); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + return 0; } -static void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait) +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) { u64 start = entry->file_offset; u64 end = start + entry->len - 1; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) - do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE); -#else - do_sync_mapping_range(inode->i_mapping, start, end, - SYNC_FILE_RANGE_WRITE); -#endif - if (wait) + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ + btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); + } } -void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +/* + * Used to wait on ordered extents across a large range of bytes. + */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { u64 end; + u64 orig_end; + u64 wait_end; struct btrfs_ordered_extent *ordered; - int found; - int should_wait = 0; + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } + wait_end = orig_end; again: - if (start + len < start) - end = (u64)-1; - else - end = start + len - 1; - found = 0; - while(1) { + /* start IO across the range first to instantiate any delalloc + * extents + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + + /* The compression code will leave pages locked but return from + * writepage without setting the page writeback. Starting again + * with WB_SYNC_ALL will end up waiting for the IO to actually start. + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + + btrfs_wait_on_page_writeback_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; + while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); - if (!ordered) { + if (!ordered) break; - } - if (ordered->file_offset >= start + len) { + if (ordered->file_offset > orig_end) { btrfs_put_ordered_extent(ordered); break; } @@ -325,45 +442,25 @@ again: btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(inode, ordered, should_wait); - found++; + btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; btrfs_put_ordered_extent(ordered); - if (end == 0) + if (end == 0 || end == start) break; end--; } - if (should_wait && found) { - should_wait = 0; + if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + schedule_timeout(1); goto again; } -} - -int btrfs_add_ordered_pending(struct inode *inode, - struct btrfs_ordered_extent *ordered, - u64 start, u64 len) -{ - WARN_ON(1); return 0; -#if 0 - int ret; - struct btrfs_ordered_inode_tree *tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - - tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); - if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { - ret = -EAGAIN; - goto out; - } - set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS); - ret = 0; -out: - mutex_unlock(&tree->mutex); - return ret; -#endif } +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset) { @@ -387,8 +484,12 @@ out: return entry; } +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset) +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -406,3 +507,222 @@ out: mutex_unlock(&tree->mutex); return entry; } + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. + */ +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; + struct rb_node *node; + struct btrfs_ordered_extent *test; + + mutex_lock(&tree->mutex); + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size >= inode->i_size || + ordered->file_offset + ordered->len <= disk_i_size) { + goto out; + } + + /* + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, + EXTENT_DELALLOC, 0)) { + goto out; + } + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + node = &ordered->rb_node; + while (1) { + node = rb_prev(node); + if (!node) + break; + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= inode->i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; + } + new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); + + /* + * at this point, we know we can safely update i_size to at least + * the offset from this ordered extent. But, we need to + * walk forward and see if ios from higher up in the file have + * finished. + */ + node = rb_next(&ordered->rb_node); + i_size_test = 0; + if (node) { + /* + * do we have an area where IO might have finished + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset > entry_end(ordered)) + i_size_test = test->file_offset; + } else { + i_size_test = i_size_read(inode); + } + + /* + * i_size_test is the end of a region after this ordered + * extent where there are no ordered extents. As long as there + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, + EXTENT_DELALLOC, 0)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +out: + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_sector_sum *sector_sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int ret = 1; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 1; + + mutex_lock(&tree->mutex); + list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { + if (disk_bytenr >= ordered_sum->bytenr) { + num_sectors = ordered_sum->len / sectorsize; + sector_sums = ordered_sum->sums; + for (i = 0; i < num_sectors; i++) { + if (sector_sums[i].bytenr == disk_bytenr) { + *sum = sector_sums[i].sum; + ret = 0; + goto out; + } + } + } + } +out: + mutex_unlock(&tree->mutex); + btrfs_put_ordered_extent(ordered); + return ret; +} + + +/** + * taken from mm/filemap.c because it isn't exported + * + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * @sync_mode: enable synchronous operation + * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets inclusive. + * + * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as + * opposed to a regular memory cleansing writeback. The difference between + * these two operations is that if a dirty page/buffer is encountered, it must + * be waited upon, and not just skipped over. + */ +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) +{ + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = mapping->nrpages * 2, + .range_start = start, + .range_end = end, + .for_writepages = 1, + }; + return btrfs_writepages(mapping, &wbc); +} + +/** + * taken from mm/filemap.c because it isn't exported + * + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * + * Wait for writeback to complete against pages indexed by start->end + * inclusive + */ +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +}