* Boston, MA 02111-1307, USA.
*/
-#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
-#include <linux/version.h>
+#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
+#include "compat.h"
-static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
- struct page **prepared_pages,
- const char __user * buf)
+/* simple helper to fault in pages and copy. This should go away
+ * and be replaced with calls into generic code.
+ *
+ * Copies write_bytes bytes from the iov_iter 'i' into prepared_pages,
+ * starting at byte (pos & (PAGE_CACHE_SIZE - 1)) of the first page.
+ * The loop is bounded by write_bytes only; num_pages is not consulted.
+ *
+ * Returns 0 once write_bytes is no longer positive, or -EFAULT if
+ * iov_iter_fault_in_readable() reports a failure.
+ */
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
+ int write_bytes,
+ struct page **prepared_pages,
+ struct iov_iter *i)
{
- long page_fault = 0;
- int i;
+ size_t copied;
+ int pg = 0;
int offset = pos & (PAGE_CACHE_SIZE - 1);
- for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+ while (write_bytes > 0) {
size_t count = min_t(size_t,
PAGE_CACHE_SIZE - offset, write_bytes);
- struct page *page = prepared_pages[i];
- fault_in_pages_readable(buf, count);
+ struct page *page = prepared_pages[pg];
+again:
+ if (unlikely(iov_iter_fault_in_readable(i, count)))
+ return -EFAULT;
/* Copy data from userspace to the current page */
- kmap(page);
- page_fault = __copy_from_user(page_address(page) + offset,
- buf, count);
+ copied = iov_iter_copy_from_user(page, i, offset, count);
+
/* Flush processor's dcache for this page */
flush_dcache_page(page);
- kunmap(page);
- buf += count;
- write_bytes -= count;
+ iov_iter_advance(i, copied);
+ write_bytes -= copied;
- if (page_fault)
- break;
+ if (unlikely(copied == 0)) { /* nothing copied: retry with count capped
+ * by iov_iter_single_seg_count() */
+ count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+
+ if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { /* page not filled to
+ * its end: keep writing into the same page */
+ offset += copied;
+ } else {
+ pg++;
+ offset = 0;
+ }
}
- return page_fault ? -EFAULT : 0;
+ return 0;
}
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ *
+ * Walks pages[0 .. num_pages - 1] and stops at the first NULL entry.
+ * Each page has PageChecked cleared, is unlocked, marked accessed and
+ * then released with page_cache_release().
+ */
+static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
size_t i;
for (i = 0; i < num_pages; i++) {
if (!pages[i])
break;
+ /* page checked is some magic around finding pages that
+ * have been modified without going through btrfs_set_page_dirty
+ * clear it here
+ */
+ ClearPageChecked(pages[i]);
unlock_page(pages[i]);
mark_page_accessed(pages[i]);
page_cache_release(pages[i]);
}
}
-static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
- u64 offset, ssize_t size,
- struct buffer_head *bh)
+/*
+ * after copy_from_user, pages need to be dirtied and we need to make
+ * sure holes are created between the current EOF and the start of
+ * any next extents (if required).
+ *
+ * this also makes the decision about creating an inline extent vs
+ * doing real data extents, marking pages dirty and delalloc as required.
+ *
+ * NOTE(review): the body added here only marks the sector-aligned range
+ * delalloc, sets each page uptodate and dirty, and raises the in-memory
+ * i_size when the write ends past it.  No hole creation or inline-extent
+ * decision is visible in this function - confirm where that happens.
+ *
+ * trans is not referenced by the added code.  The function always
+ * returns 0; a failure from btrfs_set_extent_delalloc() hits BUG_ON.
+ */
+static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct file *file,
+ struct page **pages,
+ size_t num_pages,
+ loff_t pos,
+ size_t write_bytes)
{
- struct btrfs_key key;
- struct btrfs_path *path;
- char *ptr, *kaddr;
- struct btrfs_trans_handle *trans;
- struct btrfs_file_extent_item *ei;
- u32 datasize;
int err = 0;
- int ret;
+ int i;
+ struct inode *inode = fdentry(file)->d_inode;
+ u64 num_bytes;
+ u64 start_pos;
+ u64 end_of_last_block;
+ u64 end_pos = pos + write_bytes;
+ loff_t isize = i_size_read(inode);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ start_pos = pos & ~((u64)root->sectorsize - 1); /* mask pos down to a
+ * sectorsize boundary (assumes sectorsize is a power of two) */
+ num_bytes = (write_bytes + pos - start_pos +
+ root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
- mutex_lock(&root->fs_info->fs_mutex);
- trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
+ end_of_last_block = start_pos + num_bytes - 1; /* inclusive end offset */
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+ NULL);
+ BUG_ON(err);
- key.objectid = inode->i_ino;
- key.offset = offset;
- key.flags = 0;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
- BUG_ON(size >= PAGE_CACHE_SIZE);
- datasize = btrfs_file_extent_calc_inline_size(size);
-
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- datasize);
- if (ret) {
- err = ret;
- goto fail;
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = pages[i];
+ SetPageUptodate(p);
+ ClearPageChecked(p);
+ set_page_dirty(p);
}
- ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
- path->slots[0], struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(ei, trans->transid);
- btrfs_set_file_extent_type(ei,
- BTRFS_FILE_EXTENT_INLINE);
- ptr = btrfs_file_extent_inline_start(ei);
-
- kaddr = kmap_atomic(bh->b_page, KM_USER0);
- btrfs_memcpy(root, path->nodes[0]->b_data,
- ptr, kaddr + bh_offset(bh),
- size);
- kunmap_atomic(kaddr, KM_USER0);
- btrfs_mark_buffer_dirty(path->nodes[0]);
-fail:
- btrfs_free_path(path);
- ret = btrfs_end_transaction(trans, root);
- if (ret && !err)
- err = ret;
- mutex_unlock(&root->fs_info->fs_mutex);
- return err;
+ if (end_pos > isize) {
+ i_size_write(inode, end_pos);
+ /* we've only changed i_size in ram, and we haven't updated
+ * the disk i_size. There is no need to log the inode
+ * at this time.
+ */
+ }
+ return 0;
}
-static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct file *file,
- struct page **pages,
- size_t num_pages,
- loff_t pos,
- size_t write_bytes)
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end]. Existing extents are split as required.
+ *
+ * end == (u64)-1 means "no upper bound": len is set to (u64)-1 and the
+ * end-of-range tests are skipped.
+ * skip_pinned: when nonzero, mappings with EXTENT_FLAG_PINNED set are
+ * left in the tree and the walk continues after them.
+ *
+ * Always returns 0.
+ * NOTE(review): the results of alloc_extent_map() are not checked before
+ * split is written to - confirm it cannot return NULL here.
+ */
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned)
{
- int i;
- int offset;
- int err = 0;
+ struct extent_map *em;
+ struct extent_map *split = NULL;
+ struct extent_map *split2 = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 len = end - start + 1;
int ret;
- int this_write;
- struct inode *inode = file->f_path.dentry->d_inode;
- struct buffer_head *bh;
+ int testend = 1;
+ unsigned long flags;
+ int compressed = 0;
- for (i = 0; i < num_pages; i++) {
- offset = pos & (PAGE_CACHE_SIZE -1);
- this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
-
- /* FIXME, one block at a time */
- bh = page_buffers(pages[i]);
-
- if (buffer_mapped(bh) && bh->b_blocknr == 0) {
- ret = insert_inline_extent(root, inode,
- pages[i]->index << PAGE_CACHE_SHIFT,
- offset + this_write, bh);
- if (ret) {
- err = ret;
- goto failed;
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ len = (u64)-1;
+ testend = 0;
+ }
+ while (1) {
+ if (!split)
+ split = alloc_extent_map(GFP_NOFS);
+ if (!split2)
+ split2 = alloc_extent_map(GFP_NOFS);
+
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ write_unlock(&em_tree->lock);
+ break;
+ }
+ flags = em->flags; /* snapshot taken before PINNED is cleared below;
+ * both splits are given this value */
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ if (testend && em->start + em->len >= start + len) {
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+ break;
}
+ start = em->start + em->len;
+ if (testend)
+ len = start + len - (em->start + em->len); /* NOTE(review):
+ * start was just set to em->start + em->len, so this
+ * expression leaves len unchanged - confirm whether
+ * the remaining length was meant to shrink */
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+ continue;
}
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ remove_extent_mapping(em_tree, em);
- ret = btrfs_commit_write(file, pages[i], offset,
- offset + this_write);
- pos += this_write;
- if (ret) {
- err = ret;
- goto failed;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ em->start < start) { /* keep the part of em that lies before start */
+ split->start = em->start;
+ split->len = start - em->start;
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+
+ split->bdev = em->bdev;
+ split->flags = flags;
+ ret = add_extent_mapping(em_tree, split);
+ BUG_ON(ret);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
}
- WARN_ON(this_write > write_bytes);
- write_bytes -= this_write;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ testend && em->start + em->len > start + len) { /* keep the part
+ * of em that lies past start + len */
+ u64 diff = start + len - em->start;
+
+ split->start = start + len;
+ split->len = em->start + em->len - (start + len);
+ split->bdev = em->bdev;
+ split->flags = flags;
+
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->block_start = em->block_start;
+ split->orig_start = em->orig_start;
+ } else {
+ split->block_len = split->len;
+ split->block_start = em->block_start + diff;
+ split->orig_start = split->start;
+ }
+
+ ret = add_extent_mapping(em_tree, split);
+ BUG_ON(ret);
+ free_extent_map(split);
+ split = NULL;
+ }
+ write_unlock(&em_tree->lock);
+
+ /* once for us */
+ free_extent_map(em);
+ /* once for the tree */
+ free_extent_map(em);
}
-failed:
- return err;
+ if (split)
+ free_extent_map(split);
+ if (split2)
+ free_extent_map(split2);
+ return 0;
}
/*
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
+ *
+ * The range handled is [start, end): the walk stops at the first item
+ * whose key offset is >= end, and the cache drop below passes end - 1.
+ *
+ * drop_cache: when nonzero, btrfs_drop_extent_cache() is called on the
+ * range before the tree is touched.
+ * hint_byte: written with the disk_bytenr of extents that are trimmed,
+ * split or freed and have a non-zero disk_bytenr; it is dereferenced
+ * without a NULL check.
+ *
+ * Returns -ENOMEM if the path cannot be allocated, or a negative value
+ * passed up from the lookup / next-leaf / duplicate-item calls.  The
+ * other calls are wrapped in BUG_ON(ret), so 0 is expected otherwise.
*/
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- u64 start, u64 end, u64 *hint_block)
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
+ u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
- int ret;
- struct btrfs_key key;
- struct btrfs_leaf *leaf;
- int slot;
- struct btrfs_file_extent_item *extent;
- u64 extent_end = 0;
- int keep;
- struct btrfs_file_extent_item old;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key new_key;
u64 search_start = start;
- int bookend;
- int found_type;
- int found_extent;
- int found_inline;
+ u64 disk_bytenr = 0;
+ u64 num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 extent_end = 0;
+ int del_nr = 0;
+ int del_slot = 0;
+ int extent_type;
int recow;
+ int ret;
+
+ if (drop_cache)
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- while(1) {
+
+ while (1) {
recow = 0;
- btrfs_release_path(root, path);
ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
search_start, -1);
if (ret < 0)
- goto out;
- if (ret > 0) {
- if (path->slots[0] == 0) {
+ break;
+ if (ret > 0 && path->slots[0] > 0 && search_start == start) { /* no
+ * exact match on the first pass: step back one slot when the
+ * previous item is an EXTENT_DATA item of this inode */
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == inode->i_ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+ ret = 0;
+next_slot:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ BUG_ON(del_nr > 0);
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
ret = 0;
- goto out;
+ break;
}
- path->slots[0]--;
+ leaf = path->nodes[0];
+ recow = 1;
}
-next_slot:
- keep = 0;
- bookend = 0;
- found_extent = 0;
- found_inline = 0;
- extent = NULL;
- leaf = btrfs_buffer_leaf(path->nodes[0]);
- slot = path->slots[0];
- ret = 0;
- btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
- if (key.offset >= end || key.objectid != inode->i_ino) {
- goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid > inode->i_ino ||
+ key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = btrfs_file_extent_offset(leaf, fi);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = key.offset +
+ btrfs_file_extent_inline_len(leaf, fi);
+ } else {
+ WARN_ON(1);
+ extent_end = search_start;
}
- if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY) {
- goto out;
+
+ if (extent_end <= search_start) {
+ path->slots[0]++;
+ goto next_slot;
}
+
+ search_start = max(key.offset, start);
if (recow) {
- search_start = key.offset;
+ btrfs_release_path(root, path); /* recow is only set after
+ * btrfs_next_leaf(): redo the lookup from search_start */
continue;
}
- if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(extent);
- if (found_type == BTRFS_FILE_EXTENT_REG) {
- extent_end = key.offset +
- (btrfs_file_extent_num_blocks(extent) <<
- inode->i_blkbits);
- found_extent = 1;
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- found_inline = 1;
- extent_end = key.offset +
- btrfs_file_extent_inline_len(leaf->items +
- slot);
+
+ /*
+ * | - range to drop - |
+ * | -------- extent -------- |
+ */
+ if (start > key.offset && end < extent_end) {
+ BUG_ON(del_nr > 0);
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = start;
+ ret = btrfs_duplicate_item(trans, root, path,
+ &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(root, path);
+ continue;
}
- } else {
- extent_end = search_start;
- }
+ if (ret < 0)
+ break;
- /* we found nothing we can drop */
- if ((!found_extent && !found_inline) ||
- search_start >= extent_end) {
- int nextret;
- u32 nritems;
- nritems = btrfs_header_nritems(
- btrfs_buffer_header(path->nodes[0]));
- if (slot >= nritems - 1) {
- nextret = btrfs_next_leaf(root, path);
- if (nextret)
- goto out;
- recow = 1;
- } else {
- path->slots[0]++;
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ extent_offset += start - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - start);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (disk_bytenr > 0) {
+ ret = btrfs_inc_extent_ref(trans, root,
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ new_key.objectid,
+ start - extent_offset);
+ BUG_ON(ret);
+ *hint_byte = disk_bytenr;
}
- goto next_slot;
+ key.offset = start; /* fall through: the duplicated item now
+ * starts at 'start' and is handled by the next case */
}
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (start <= key.offset && end < extent_end) {
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = end;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
- /* FIXME, there's only one inline extent allowed right now */
- if (found_inline) {
- u64 mask = root->blocksize - 1;
- search_start = (extent_end + mask) & ~mask;
- } else
- search_start = extent_end;
-
- if (end < extent_end && end >= key.offset) {
- if (found_extent) {
- u64 disk_blocknr =
- btrfs_file_extent_disk_blocknr(extent);
- u64 disk_num_blocks =
- btrfs_file_extent_disk_num_blocks(extent);
- memcpy(&old, extent, sizeof(old));
- if (disk_blocknr != 0) {
- ret = btrfs_inc_extent_ref(trans, root,
- disk_blocknr, disk_num_blocks);
- BUG_ON(ret);
- }
+ extent_offset += end - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_mark_buffer_dirty(leaf);
+ if (disk_bytenr > 0) {
+ inode_sub_bytes(inode, end - key.offset);
+ *hint_byte = disk_bytenr;
}
- WARN_ON(found_inline);
- bookend = 1;
- }
- /* truncate existing extent */
- if (start > key.offset) {
- u64 new_num;
- u64 old_num;
- keep = 1;
- WARN_ON(start & (root->blocksize - 1));
- if (found_extent) {
- new_num = (start - key.offset) >>
- inode->i_blkbits;
- old_num = btrfs_file_extent_num_blocks(extent);
- *hint_block =
- btrfs_file_extent_disk_blocknr(extent);
- if (btrfs_file_extent_disk_blocknr(extent)) {
- inode->i_blocks -=
- (old_num - new_num) << 3;
- }
- btrfs_set_file_extent_num_blocks(extent,
- new_num);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- } else {
- WARN_ON(1);
+ break;
+ }
+
+ search_start = extent_end;
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (start > key.offset && end >= extent_end) {
+ BUG_ON(del_nr > 0);
+ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+ if (disk_bytenr > 0) {
+ inode_sub_bytes(inode, extent_end - start);
+ *hint_byte = disk_bytenr;
}
+ if (end == extent_end)
+ break;
+
+ path->slots[0]++;
+ goto next_slot;
}
- /* delete the entire extent */
- if (!keep) {
- u64 disk_blocknr = 0;
- u64 disk_num_blocks = 0;
- u64 extent_num_blocks = 0;
- if (found_extent) {
- disk_blocknr =
- btrfs_file_extent_disk_blocknr(extent);
- disk_num_blocks =
- btrfs_file_extent_disk_num_blocks(extent);
- extent_num_blocks =
- btrfs_file_extent_num_blocks(extent);
- *hint_block =
- btrfs_file_extent_disk_blocknr(extent);
+
+ /*
+ * | ---- range to drop ----- |
+ * | ------ extent ------ |
+ */
+ if (start <= key.offset && end >= extent_end) {
+ if (del_nr == 0) { /* start a batch of adjacent slots; the batch
+ * is removed later by btrfs_del_items() */
+ del_slot = path->slots[0];
+ del_nr = 1;
+ } else {
+ BUG_ON(del_slot + del_nr != path->slots[0]);
+ del_nr++;
}
- ret = btrfs_del_item(trans, root, path);
- /* TODO update progress marker and return */
- BUG_ON(ret);
- btrfs_release_path(root, path);
- extent = NULL;
- if (found_extent && disk_blocknr != 0) {
- inode->i_blocks -= extent_num_blocks << 3;
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ inode_sub_bytes(inode,
+ extent_end - key.offset);
+ extent_end = ALIGN(extent_end,
+ root->sectorsize);
+ } else if (disk_bytenr > 0) {
ret = btrfs_free_extent(trans, root,
- disk_blocknr,
- disk_num_blocks, 0);
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ key.objectid, key.offset -
+ extent_offset);
+ BUG_ON(ret);
+ inode_sub_bytes(inode,
+ extent_end - key.offset);
+ *hint_byte = disk_bytenr;
}
- BUG_ON(ret);
- if (!bookend && search_start >= end) {
- ret = 0;
- goto out;
+ if (end == extent_end)
+ break;
+
+ if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
+ path->slots[0]++;
+ goto next_slot;
}
- if (!bookend)
- continue;
+
+ ret = btrfs_del_items(trans, root, path, del_slot,
+ del_nr);
+ BUG_ON(ret);
+
+ del_nr = 0;
+ del_slot = 0;
+
+ btrfs_release_path(root, path);
+ continue;
+ }
+
+ BUG_ON(1); /* none of the four overlap cases above matched */
+ }
+
+ if (del_nr > 0) {
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ BUG_ON(ret);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+/*
+ * Decide whether the item at 'slot' in 'leaf' can be merged with an
+ * extent described by (objectid, bytenr, orig_offset).
+ *
+ * Returns 0 when slot is out of range, when the item is not an
+ * EXTENT_DATA item of 'objectid', when it is not a REG extent with disk
+ * bytenr 'bytenr' and file-extent offset key.offset - orig_offset, or
+ * when it carries compression, encryption or another encoding.
+ *
+ * *start and *end act as filters on input: a non-zero *start must equal
+ * the item's key offset and a non-zero *end must equal the item's end,
+ * while zero means "do not check".  On a match both are overwritten
+ * with the item's start and end offsets and 1 is returned.
+ */
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+ u64 objectid, u64 bytenr, u64 orig_offset,
+ u64 *start, u64 *end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 extent_end;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+ btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+ btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ return 0;
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if ((*start && *start != key.offset) || (*end && *end != extent_end))
+ return 0;
+
+ *start = key.offset;
+ *end = extent_end;
+ return 1;
+}
+
+/*
+ * Mark extent in the range start - end as written.
+ *
+ * This changes extent type from 'pre-allocated' to 'regular'. If only
+ * part of extent is marked as written, the extent will be split into
+ * two or three.
+ *
+ * The extent cache is dropped for start .. end - 1 first.  The item
+ * found for 'start' must be a PREALLOC extent of this inode that covers
+ * the whole range, otherwise BUG_ON fires.  Neighbouring items accepted
+ * by extent_mergeable() are merged with the written range instead of
+ * being left as separate items.
+ *
+ * Always returns 0; a path allocation failure is a BUG_ON.
+ * NOTE(review): a negative return from btrfs_search_slot() is not
+ * checked before the leaf is read - confirm this cannot happen here.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key new_key;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 extent_end;
+ u64 orig_offset;
+ u64 other_start;
+ u64 other_end;
+ u64 split;
+ int del_nr = 0;
+ int del_slot = 0;
+ int recow;
+ int ret;
+
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+again:
+ recow = 0;
+ split = start;
+ key.objectid = inode->i_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = split;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ BUG_ON(key.objectid != inode->i_ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ BUG_ON(btrfs_file_extent_type(leaf, fi) !=
+ BTRFS_FILE_EXTENT_PREALLOC);
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ BUG_ON(key.offset > start || extent_end < end);
+
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+ memcpy(&new_key, &key, sizeof(new_key));
+
+ if (start == key.offset && end < extent_end) { /* written range is the
+ * head of the extent: try to grow the previous item over it */
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ new_key.offset = end;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_set_file_extent_offset(leaf, fi,
+ end - orig_offset);
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ end - other_start);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
}
- /* create bookend, splitting the extent in two */
- if (bookend && found_extent) {
- struct btrfs_key ins;
- ins.objectid = inode->i_ino;
- ins.offset = end;
- ins.flags = 0;
- btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
+ }
+
+ if (start > key.offset && end == extent_end) { /* written range is the
+ * tail of the extent: try to grow the next item backwards */
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ path->slots[0]++;
+ new_key.offset = start;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ other_end - start);
+ btrfs_set_file_extent_offset(leaf, fi,
+ start - orig_offset);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
+
+ while (start > key.offset || end < extent_end) { /* split the item at
+ * 'start' and/or 'end' until it covers exactly the range */
+ if (key.offset == start)
+ split = end;
+
+ new_key.offset = split;
+ ret = btrfs_duplicate_item(trans, root, path, &new_key);
+ if (ret == -EAGAIN) {
btrfs_release_path(root, path);
- ret = btrfs_insert_empty_item(trans, root, path, &ins,
- sizeof(*extent));
+ goto again;
+ }
+ BUG_ON(ret < 0);
- if (ret) {
- btrfs_print_leaf(root, btrfs_buffer_leaf(path->nodes[0]));
- printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu\n", ret , ins.objectid, ins.flags, ins.offset, start, end, key.offset, extent_end);
- }
- BUG_ON(ret);
- extent = btrfs_item_ptr(
- btrfs_buffer_leaf(path->nodes[0]),
- path->slots[0],
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_blocknr(extent,
- btrfs_file_extent_disk_blocknr(&old));
- btrfs_set_file_extent_disk_num_blocks(extent,
- btrfs_file_extent_disk_num_blocks(&old));
-
- btrfs_set_file_extent_offset(extent,
- btrfs_file_extent_offset(&old) +
- ((end - key.offset) >> inode->i_blkbits));
- WARN_ON(btrfs_file_extent_num_blocks(&old) <
- (extent_end - end) >> inode->i_blkbits);
- btrfs_set_file_extent_num_blocks(extent,
- (extent_end - end) >> inode->i_blkbits);
-
- btrfs_set_file_extent_type(extent,
- BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_generation(extent,
- btrfs_file_extent_generation(&old));
- btrfs_mark_buffer_dirty(path->nodes[0]);
- if (btrfs_file_extent_disk_blocknr(&old) != 0) {
- inode->i_blocks +=
- btrfs_file_extent_num_blocks(extent) << 3;
- }
- ret = 0;
- goto out;
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ split - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - split);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
+
+ if (split == start) {
+ key.offset = start;
+ } else {
+ BUG_ON(start != key.offset);
+ path->slots[0]--;
+ extent_end = end;
+ }
+ recow = 1;
+ }
+
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) { /* a split happened above: restart from the search
+ * before merging */
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ extent_end = other_end;
+ del_slot = path->slots[0] + 1;
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ 0, root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
+ }
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(root, path);
+ goto again;
}
+ key.offset = other_start;
+ del_slot = path->slots[0];
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ 0, root->root_key.objectid,
+ inode->i_ino, orig_offset);
+ BUG_ON(ret);
+ }
+ if (del_nr == 0) { /* nothing merged: just flip the item to REG */
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ fi = btrfs_item_ptr(leaf, del_slot - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ BUG_ON(ret);
}
out:
btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
- * this gets pages into the page cache and locks them down
+ * this gets pages into the page cache and locks them down, it also properly
+ * waits for data=ordered extents to finish before allowing the pages to be
+ * modified.
+ *
+ * Steps visible in the added code:
+ *  - if the sector-aligned start is past i_size, btrfs_cont_expand() is
+ *    called and its error, if any, is returned;
+ *  - num_pages pages are grabbed starting at index pos >> PAGE_CACHE_SHIFT
+ *    and writeback on each is waited for;
+ *  - if the start is inside i_size, the extent range is locked; when an
+ *    ordered extent overlaps it, the pages are unlocked and released,
+ *    the ordered range is waited on and the grab is retried;
+ *  - EXTENT_DIRTY, EXTENT_DELALLOC and EXTENT_DO_ACCOUNTING are cleared
+ *    on the range before it is unlocked.
+ *
+ * first_index, last_index and write_bytes are not referenced by the
+ * added code.  Returns 0 on every path other than the cont_expand error.
*/
-static int prepare_pages(struct btrfs_root *root,
- struct file *file,
- struct page **pages,
- size_t num_pages,
- loff_t pos,
- unsigned long first_index,
- unsigned long last_index,
- size_t write_bytes)
+static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
+ struct page **pages, size_t num_pages,
+ loff_t pos, unsigned long first_index,
+ unsigned long last_index, size_t write_bytes)
{
+ struct extent_state *cached_state = NULL;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
- struct inode *inode = file->f_path.dentry->d_inode;
- int offset;
+ struct inode *inode = fdentry(file)->d_inode;
int err = 0;
- int this_write;
- struct buffer_head *bh;
- struct buffer_head *head;
- loff_t isize = i_size_read(inode);
- struct btrfs_trans_handle *trans;
- u64 hint_block;
- u64 num_blocks;
- u64 alloc_extent_start;
u64 start_pos;
- struct btrfs_key ins;
+ u64 last_pos;
- start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
- num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
- inode->i_blkbits;
+ start_pos = pos & ~((u64)root->sectorsize - 1);
+ last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; /* byte offset
+ * just past the last page grabbed below */
- memset(pages, 0, num_pages * sizeof(struct page *));
+ if (start_pos > inode->i_size) {
+ err = btrfs_cont_expand(inode, start_pos);
+ if (err)
+ return err;
+ }
+ memset(pages, 0, num_pages * sizeof(struct page *));
+again:
for (i = 0; i < num_pages; i++) {
pages[i] = grab_cache_page(inode->i_mapping, index + i);
if (!pages[i]) {
err = -ENOMEM;
- goto failed_release;
+ BUG_ON(1); /* NOTE(review): allocation failure hits BUG_ON; no
+ * unwind path returns err from here */
}
- cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
wait_on_page_writeback(pages[i]);
}
-
- mutex_lock(&root->fs_info->fs_mutex);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- err = -ENOMEM;
- mutex_unlock(&root->fs_info->fs_mutex);
- goto out_unlock;
- }
- btrfs_set_trans_block_group(trans, inode);
- /* FIXME blocksize != 4096 */
- inode->i_blocks += num_blocks << 3;
- hint_block = 0;
-
- /* FIXME...EIEIO, ENOSPC and more */
-
- /* step one, delete the existing extents in this range */
- /* FIXME blocksize != pagesize */
if (start_pos < inode->i_size) {
- err = btrfs_drop_extents(trans, root, inode,
- start_pos, (pos + write_bytes + root->blocksize -1) &
- ~((u64)root->blocksize - 1), &hint_block);
- if (err)
- goto failed_release;
- }
-
- /* insert any holes we need to create */
- if (inode->i_size < start_pos) {
- u64 last_pos_in_file;
- u64 hole_size;
- u64 mask = root->blocksize - 1;
- last_pos_in_file = (isize + mask) & ~mask;
- hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
- hole_size >>= inode->i_blkbits;
- if (last_pos_in_file < start_pos) {
- err = btrfs_insert_file_extent(trans, root,
- inode->i_ino,
- last_pos_in_file,
- 0, 0, hole_size);
+ struct btrfs_ordered_extent *ordered;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, 0, &cached_state,
+ GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ last_pos - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > start_pos &&
+ ordered->file_offset < last_pos) { /* ordered extent overlaps
+ * [start_pos, last_pos): back out, wait, then retry */
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1,
+ &cached_state, GFP_NOFS);
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_wait_ordered_range(inode, start_pos,
+ last_pos - start_pos);
+ goto again;
}
- if (err)
- goto failed_release;
- }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
- /*
- * either allocate an extent for the new bytes or setup the key
- * to show we are doing inline data in the extent
- */
- if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
- pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
- err = btrfs_alloc_extent(trans, root, inode->i_ino,
- num_blocks, hint_block, (u64)-1,
- &ins, 1);
- if (err)
- goto failed_truncate;
- err = btrfs_insert_file_extent(trans, root, inode->i_ino,
- start_pos, ins.objectid, ins.offset,
- ins.offset);
- if (err)
- goto failed_truncate;
- } else {
- ins.offset = 0;
- ins.objectid = 0;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
+ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+ GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, &cached_state,
+ GFP_NOFS);
}
- BUG_ON(err);
- alloc_extent_start = ins.objectid;
- err = btrfs_end_transaction(trans, root);
- mutex_unlock(&root->fs_info->fs_mutex);
-
for (i = 0; i < num_pages; i++) {
- offset = pos & (PAGE_CACHE_SIZE -1);
- this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
- if (!page_has_buffers(pages[i])) {
- create_empty_buffers(pages[i],
- root->fs_info->sb->s_blocksize,
- (1 << BH_Uptodate));
- }
- head = page_buffers(pages[i]);
- bh = head;
- do {
- err = btrfs_map_bh_to_logical(root, bh,
- alloc_extent_start);
- BUG_ON(err);
- if (err)
- goto failed_truncate;
- bh = bh->b_this_page;
- if (alloc_extent_start)
- alloc_extent_start++;
- } while (bh != head);
- pos += this_write;
- WARN_ON(this_write > write_bytes);
- write_bytes -= this_write;
+ clear_page_dirty_for_io(pages[i]);
+ set_page_extent_mapped(pages[i]);
+ WARN_ON(!PageLocked(pages[i]));
}
return 0;
-
-failed_release:
- btrfs_drop_pages(pages, num_pages);
- return err;
-
-failed_truncate:
- btrfs_drop_pages(pages, num_pages);
- if (pos > isize)
- vmtruncate(inode, isize);
- return err;
-
-out_unlock:
- mutex_unlock(&root->fs_info->fs_mutex);
- goto failed_release;
-
}
-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- loff_t pos;
- size_t num_written = 0;
- int err = 0;
- int ret = 0;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page *pinned[2];
struct page **pages = NULL;
+ struct iov_iter i;
+ loff_t *ppos = &iocb->ki_pos;
+ loff_t start_pos;
+ ssize_t num_written = 0;
+ ssize_t err = 0;
+ size_t count;
+ size_t ocount;
+ int ret = 0;
int nrptrs;
- struct page *pinned[2];
unsigned long first_index;
unsigned long last_index;
+ int will_write;
+ int buffered = 0;
+
+ will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
+ (file->f_flags & O_DIRECT));
- nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
- PAGE_CACHE_SIZE / (sizeof(struct page *)));
pinned[0] = NULL;
pinned[1] = NULL;
- if (file->f_flags & O_DIRECT)
- return -EINVAL;
- pos = *ppos;
+
+ start_pos = pos;
+
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ mutex_lock(&inode->i_mutex);
+
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err)
+ goto out;
+ count = ocount;
+
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
+
if (count == 0)
goto out;
- err = remove_suid(file->f_path.dentry);
+
+ err = file_remove_suid(file);
if (err)
goto out;
+
file_update_time(file);
+ BTRFS_I(inode)->sequence++;
+
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+ pos, ppos, count,
+ ocount);
+ /*
+ * the generic O_DIRECT will update in-memory i_size after the
+ * DIOs are done. But our endio handlers that update the on
+ * disk i_size never update past the in memory i_size. So we
+ * need one more update here to catch any additions to the
+ * file
+ */
+ if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ mark_inode_dirty(inode);
+ }
+
+ if (num_written < 0) {
+ ret = num_written;
+ num_written = 0;
+ goto out;
+ } else if (num_written == count) {
+ /* pick up pos changes done by the generic code */
+ pos = *ppos;
+ goto out;
+ }
+ /*
+ * We are going to do buffered for the rest of the range, so we
+ * need to make sure to invalidate the buffered pages when we're
+ * done.
+ */
+ buffered = 1;
+ pos += num_written;
+ }
+ iov_iter_init(&i, iov, nr_segs, count, num_written);
+ nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+ (sizeof(struct page *)));
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
- mutex_lock(&inode->i_mutex);
+ /* generic_write_checks can change our pos */
+ start_pos = pos;
+
first_index = pos >> PAGE_CACHE_SHIFT;
- last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+ last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
/*
* there are lots of better ways to do this, but this code
unlock_page(pinned[0]);
}
}
- if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+ if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
pinned[1] = grab_cache_page(inode->i_mapping, last_index);
if (!PageUptodate(pinned[1])) {
ret = btrfs_readpage(NULL, pinned[1]);
}
}
- while(count > 0) {
+ while (iov_iter_count(&i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
- size_t write_bytes = min(count, nrptrs *
- (size_t)PAGE_CACHE_SIZE -
+ size_t write_bytes = min(iov_iter_count(&i),
+ nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
WARN_ON(num_pages > nrptrs);
- memset(pages, 0, sizeof(pages));
+ memset(pages, 0, sizeof(struct page *) * nrptrs);
+
+ ret = btrfs_delalloc_reserve_space(inode, write_bytes);
+ if (ret)
+ goto out;
+
ret = prepare_pages(root, file, pages, num_pages,
pos, first_index, last_index,
write_bytes);
- if (ret)
+ if (ret) {
+ btrfs_delalloc_release_space(inode, write_bytes);
goto out;
+ }
ret = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, buf);
- if (ret) {
- btrfs_drop_pages(pages, num_pages);
- goto out;
+ write_bytes, pages, &i);
+ if (ret == 0) {
+ dirty_and_release_pages(NULL, root, file, pages,
+ num_pages, pos, write_bytes);
}
- ret = dirty_and_release_pages(NULL, root, file, pages,
- num_pages, pos, write_bytes);
btrfs_drop_pages(pages, num_pages);
- if (ret)
+ if (ret) {
+ btrfs_delalloc_release_space(inode, write_bytes);
goto out;
+ }
+
+ if (will_write) {
+ filemap_fdatawrite_range(inode->i_mapping, pos,
+ pos + write_bytes - 1);
+ } else {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ num_pages);
+ if (num_pages <
+ (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(root, 1);
+ btrfs_throttle(root);
+ }
- buf += write_bytes;
- count -= write_bytes;
pos += write_bytes;
num_written += write_bytes;
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
- btrfs_btree_balance_dirty(root);
cond_resched();
}
- mutex_unlock(&inode->i_mutex);
out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ err = ret;
+
kfree(pages);
if (pinned[0])
page_cache_release(pinned[0]);
if (pinned[1])
page_cache_release(pinned[1]);
*ppos = pos;
+
+ /*
+ * we want to make sure fsync finds this change
+ * but we haven't joined a transaction running right now.
+ *
+ * Later on, someone is sure to update the inode and get the
+ * real transid recorded.
+ *
+ * We set last_trans now to the fs_info generation + 1,
+ * this will either be one more than the running transaction
+ * or the generation used for the next transaction if there isn't
+ * one running right now.
+ */
+ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
+ if (num_written > 0 && will_write) {
+ struct btrfs_trans_handle *trans;
+
+ err = btrfs_wait_ordered_range(inode, start_pos, num_written);
+ if (err)
+ num_written = err;
+
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+ trans = btrfs_start_transaction(root, 0);
+ ret = btrfs_log_dentry_safe(trans, root,
+ file->f_dentry);
+ if (ret == 0) {
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ btrfs_end_transaction(trans, root);
+ else
+ btrfs_commit_transaction(trans, root);
+ } else if (ret != BTRFS_NO_LOG_SYNC) {
+ btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
+ }
+ if (file->f_flags & O_DIRECT && buffered) {
+ invalidate_mapping_pages(inode->i_mapping,
+ start_pos >> PAGE_CACHE_SHIFT,
+ (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
+ }
+ }
current->backing_dev_info = NULL;
- mark_inode_dirty(inode);
return num_written ? num_written : err;
}
-static int btrfs_sync_file(struct file *file,
- struct dentry *dentry, int datasync)
+int btrfs_release_file(struct inode *inode, struct file *filp)
+{
+ /*
+ * Release handler for btrfs files (installed as .release in
+ * btrfs_file_operations). Always returns 0.
+ *
+ * ordered_data_close is set by setattr when we are about to truncate
+ * a file from a non-zero size to a zero size. This tries to
+ * flush down new bytes that may have been written if the
+ * application were using truncate to replace a file in place.
+ *
+ * When the flag is set it is cleared, the inode is handed to
+ * btrfs_add_ordered_operation(), and filemap_flush() is started only
+ * if i_size is larger than BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT.
+ *
+ * Independently of the flag, a non-NULL filp->private_data causes
+ * btrfs_ioctl_trans_end() to be called on the file.
+ * NOTE(review): presumably private_data holds a transaction opened
+ * through the ioctl interface -- confirm against ioctl.c.
+ */
+ if (BTRFS_I(inode)->ordered_data_close) {
+ BTRFS_I(inode)->ordered_data_close = 0;
+ btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(inode->i_mapping);
+ }
+ if (filp->private_data)
+ btrfs_ioctl_trans_end(filp);
+ return 0;
+}
+
+/*
+ * fsync call for both files and directories. This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It first waits for ordered extents over the whole file
+ * (btrfs_wait_ordered_range) so that all ordered extent updates in the
+ * metadata btree are up to date for copying to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit. This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk. The mutex is taken
+ * again before returning.
+ *
+ * Returns 0 without syncing when the inode has no last_trans recorded or
+ * when last_trans is not newer than fs_info->last_trans_committed.
+ * Otherwise returns PTR_ERR() of a failed btrfs_start_transaction(), a
+ * negative value from btrfs_log_dentry_safe(), or the result of the final
+ * end/commit call; any positive result is converted to -EIO.
+ *
+ * NOTE(review): when btrfs_log_dentry_safe() returns a negative value the
+ * function jumps to out without ending or committing trans -- confirm
+ * whether the handle is released elsewhere.
+ * NOTE(review): file is dereferenced for f_path.dentry before the later
+ * "file && file->private_data" test, so that NULL check looks redundant --
+ * confirm whether callers may pass a NULL file.
+ */
+int btrfs_sync_file(struct file *file, int datasync)
{
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret;
+ int ret = 0;
struct btrfs_trans_handle *trans;
+
+ /* we wait first, since the writeback may change the inode */
+ root->log_batch++;
+ /* the VFS called filemap_fdatawrite for us */
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ root->log_batch++;
+
+ /*
+ * check the transaction that last modified this inode
+ * and see if it's already been committed
+ */
+ if (!BTRFS_I(inode)->last_trans)
+ goto out;
+
+ /*
+ * if the last transaction that changed this file was before
+ * the current transaction, we can bail out now without any
+ * syncing
+ */
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed) {
+ BTRFS_I(inode)->last_trans = 0;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ goto out;
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+
/*
- * FIXME, use inode generation number to check if we can skip the
- * commit
+ * ok we haven't committed the transaction yet, let's do a commit
*/
- mutex_lock(&root->fs_info->fs_mutex);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
+ if (file && file->private_data)
+ btrfs_ioctl_trans_end(file);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_log_dentry_safe(trans, root, dentry);
+ if (ret < 0)
goto out;
+
+ /* we've logged all the items and now have a consistent
+ * version of the file in the log. It is possible that
+ * someone will come in and modify the file, but that's
+ * fine because the log is consistent on disk, and we
+ * have references to all of the file's extents
+ *
+ * It is possible that someone will come in and log the
+ * file again, but that will end up using the synchronization
+ * inside btrfs_sync_log to keep things safe.
+ */
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+ if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret > 0) {
+ ret = btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ } else {
+ ret = btrfs_end_transaction(trans, root);
}
- ret = btrfs_commit_transaction(trans, root);
- mutex_unlock(&root->fs_info->fs_mutex);
+ mutex_lock(&dentry->d_inode->i_mutex);
out:
- return ret > 0 ? EIO : ret;
+ return ret > 0 ? -EIO : ret;
}
-static struct vm_operations_struct btrfs_file_vm_ops = {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
- .nopage = filemap_nopage,
- .populate = filemap_populate,
-#else
+static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
-#endif
.page_mkwrite = btrfs_page_mkwrite,
};
return 0;
}
-struct file_operations btrfs_file_operations = {
+const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.aio_read = generic_file_aio_read,
- .write = btrfs_file_write,
+ .splice_read = generic_file_splice_read,
+ .aio_write = btrfs_file_aio_write,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
- .ioctl = btrfs_ioctl,
+ .release = btrfs_release_file,
.fsync = btrfs_sync_file,
+ .unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = btrfs_compat_ioctl,
+ .compat_ioctl = btrfs_ioctl,
#endif
};
-