X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Focfs2%2Ffile.c;h=06ccf6a86d35ca01a8f80e5a997a30ea8820f30a;hb=90aeb7c01c2da631cb611871a50980cbb6ca7149;hp=f516619a374481eb8a507b687a8cd6cbc5901dc9;hpb=49cb8d2d496ce06869ccca2ab368ed6b0b5b979d;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f516619..06ccf6a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -31,9 +31,11 @@ #include #include #include -#include +#include #include #include +#include +#include #define MLOG_MASK_PREFIX ML_INODE #include @@ -50,9 +52,14 @@ #include "inode.h" #include "ioctl.h" #include "journal.h" +#include "locks.h" #include "mmap.h" #include "suballoc.h" #include "super.h" +#include "xattr.h" +#include "acl.h" +#include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -62,6 +69,35 @@ static int ocfs2_sync_inode(struct inode *inode) return sync_mapping_buffers(inode->i_mapping); } +static int ocfs2_init_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp; + + fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->fp_file = file; + mutex_init(&fp->fp_mutex); + ocfs2_file_lock_res_init(&fp->fp_flock, fp); + file->private_data = fp; + + return 0; +} + +static void ocfs2_free_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (fp) { + ocfs2_simple_drop_lockres(osb, &fp->fp_flock); + ocfs2_lock_res_free(&fp->fp_flock); + kfree(fp); + file->private_data = NULL; + } +} + static int ocfs2_file_open(struct inode *inode, struct file *file) { int status; @@ -88,7 +124,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) oi->ip_open_count++; spin_unlock(&oi->ip_lock); - status = 0; + + status = ocfs2_init_file_private(inode, file); + if (status) { + /* + * We want to set open count back if we're failing the + * open. + */ + spin_lock(&oi->ip_lock); + oi->ip_open_count--; + spin_unlock(&oi->ip_lock); + } + leave: mlog_exit(status); return status; @@ -107,11 +154,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file) oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; spin_unlock(&oi->ip_lock); + ocfs2_free_file_private(inode, file); + mlog_exit(0); return 0; } +static int ocfs2_dir_open(struct inode *inode, struct file *file) +{ + return ocfs2_init_file_private(inode, file); +} + +static int ocfs2_dir_release(struct inode *inode, struct file *file) +{ + ocfs2_free_file_private(inode, file); + return 0; +} + static int ocfs2_sync_file(struct file *file, struct dentry *dentry, int datasync) @@ -128,8 +188,11 @@ static int ocfs2_sync_file(struct file *file, if (err) goto bail; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto bail; + journal = osb->journal->j_journal; - err = journal_force_commit(journal); + err = jbd2_journal_force_commit(journal); bail: mlog_exit(err); @@ -186,37 +249,54 @@ int ocfs2_update_inode_atime(struct inode *inode, int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; mlog_entry_void(); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Don't use ocfs2_mark_inode_dirty() here as we don't always + * have i_mutex to guard against concurrent changes to other + * inode fields. + */ inode->i_atime = CURRENT_TIME; - ret = ocfs2_mark_inode_dirty(handle, inode, bh); + di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, bh); if (ret < 0) mlog_errno(ret); +out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: mlog_exit(ret); return ret; } -int ocfs2_set_inode_size(handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 new_i_size) +static int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) { int status; mlog_entry_void(); i_size_write(inode, new_i_size); - inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_ctime = inode->i_mtime = CURRENT_TIME; status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); @@ -230,17 +310,17 @@ bail: return status; } -static int ocfs2_simple_size_update(struct inode *inode, - struct buffer_head *di_bh, - u64 new_i_size) +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } @@ -255,6 +335,39 @@ out: return ret; } +static int ocfs2_cow_file_pos(struct inode *inode, + struct buffer_head *fe_bh, + u64 offset) +{ + int status; + u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + /* + * If the new offset is aligned to the range of the cluster, there is + * no space for ocfs2_zero_range_for_truncate to fill, so no need to + * CoW either. + */ + if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) + return 0; + + status = ocfs2_get_clusters(inode, cpos, &phys, + &num_clusters, &ext_flags); + if (status) { + mlog_errno(status); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); + +out: + return status; +} + static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, @@ -263,9 +376,21 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, int status; handle_t *handle; struct ocfs2_dinode *di; + u64 cluster_bytes; mlog_entry_void(); + /* + * We need to CoW the cluster contains the offset if it is reflinked + * since we will call ocfs2_zero_range_for_truncate later which will + * write "0" from offset to the end of the cluster. + */ + status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); + if (status) { + mlog_errno(status); + return status; + } + /* TODO: This needs to actually orphan the inode in this * transaction. */ @@ -276,8 +401,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; @@ -286,14 +411,15 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, /* * Do this before setting i_size. */ - status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); + cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); + status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, + cluster_bytes); if (status) { mlog_errno(status); goto out_commit; } i_size_write(inode, new_i_size); - inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); inode->i_ctime = inode->i_mtime = CURRENT_TIME; di = (struct ocfs2_dinode *) fe_bh->b_data; @@ -326,14 +452,9 @@ static int ocfs2_truncate_file(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)new_i_size); - truncate_inode_pages(inode->i_mapping, new_i_size); - + /* We trust di_bh because it comes from ocfs2_inode_lock(), which + * already validated it */ fe = (struct ocfs2_dinode *) di_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail; - } mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), "Inode %llu, inode i_size = %lld != di " @@ -362,14 +483,25 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; - /* This forces other nodes to sync and drop their pages. Do - * this even if we have a truncate without allocation change - - * ocfs2 cluster sizes can be much greater than page size, so - * we have to truncate them anyway. */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * The inode lock forced other nodes to sync and drop their + * pages, which (correctly) happens even if we have a truncate + * without allocation change - ocfs2 cluster sizes can be much + * greater than page size, so we have to truncate them + * anyway. + */ + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + status = ocfs2_truncate_inline(inode, di_bh, new_i_size, + i_size_read(inode), 1); + if (status) + mlog_errno(status); + + goto bail_unlock_sem; } /* alright, we're going to need to do a full blown alloc size @@ -379,33 +511,35 @@ static int ocfs2_truncate_file(struct inode *inode, status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } status = ocfs2_commit_truncate(osb, inode, di_bh, tc); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } /* TODO: orphan dir cleanup here. */ -bail_unlock_data: - ocfs2_data_unlock(inode, 1); +bail_unlock_sem: + up_write(&OCFS2_I(inode)->ip_alloc_sem); bail: + if (!status && OCFS2_I(inode)->ip_clusters == 0) + status = ocfs2_try_remove_refcount_tree(inode, di_bh); mlog_exit(status); return status; } /* - * extend allocation only here. + * extend file allocation only here. * we'll update all the disk stuff, and oip->alloc_size * * expect stuff to be locked, a transaction started and enough data / @@ -414,184 +548,35 @@ bail: * Will return -EAGAIN, and a reason if a restart is needed. * If passed in, *reason will always be set, even in error. */ -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret) -{ - int status = 0; - int free_extents; - struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; - enum ocfs2_alloc_restarted reason = RESTART_NONE; - u32 bit_off, num_bits; - u64 block; - - BUG_ON(!clusters_to_add); - - free_extents = ocfs2_num_free_extents(osb, inode, fe); - if (free_extents < 0) { - status = free_extents; - mlog_errno(status); - goto leave; - } - - /* there are two cases which could cause us to EAGAIN in the - * we-need-more-metadata case: - * 1) we haven't reserved *any* - * 2) we are so fragmented, we've needed to add metadata too - * many times. */ - if (!free_extents && !meta_ac) { - mlog(0, "we haven't reserved any metadata!\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } else if ((!free_extents) - && (ocfs2_alloc_context_bits_left(meta_ac) - < ocfs2_extend_meta_needed(fe))) { - mlog(0, "filesystem is really fragmented...\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } - - status = ocfs2_claim_clusters(osb, handle, data_ac, 1, - &bit_off, &num_bits); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - - BUG_ON(num_bits > clusters_to_add); - - /* reserve our write early -- insert_extent may update the inode */ - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, - *logical_offset, block, num_bits, - meta_ac); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - clusters_to_add -= num_bits; - *logical_offset += num_bits; - - if (clusters_to_add) { - mlog(0, "need to alloc once more, clusters = %u, wanted = " - "%u\n", fe->i_clusters, clusters_to_add); - status = -EAGAIN; - reason = RESTART_TRANS; - } - -leave: - mlog_exit(status); - if (reason_ret) - *reason_ret = reason; - return status; -} - -/* - * For a given allocation, determine which allocators will need to be - * accessed, and lock them, reserving the appropriate number of bits. - * - * Called from ocfs2_extend_allocation() for file systems which don't - * support holes, and from ocfs2_write() for file systems which - * understand sparse inodes. - */ -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac) +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) { - int ret, num_free_extents; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - *meta_ac = NULL; - *data_ac = NULL; - - mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " - "clusters_to_add = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), - le32_to_cpu(di->i_clusters), clusters_to_add); - - num_free_extents = ocfs2_num_free_extents(osb, inode, di); - if (num_free_extents < 0) { - ret = num_free_extents; - mlog_errno(ret); - goto out; - } - - /* - * Sparse allocation file systems need to be more conservative - * with reserving room for expansion - the actual allocation - * happens while we've got a journal handle open so re-taking - * a cluster lock (because we ran out of room for another - * extent) will violate ordering rules. - * - * Most of the time we'll only be seeing this 1 cluster at a time - * anyway. - */ - if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { - ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - } - - ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - -out: - if (ret) { - if (*meta_ac) { - ocfs2_free_alloc_context(*meta_ac); - *meta_ac = NULL; - } + int ret; + struct ocfs2_extent_tree et; - /* - * We cannot have an error and a non null *data_ac. - */ - } + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); + ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); return ret; } -static int ocfs2_extend_allocation(struct inode *inode, - u32 clusters_to_add) +static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) { int status = 0; int restart_func = 0; - int drop_alloc_sem = 0; int credits; - u32 prev_clusters, logical_start; + u32 prev_clusters; struct buffer_head *bh = NULL; struct ocfs2_dinode *fe = NULL; handle_t *handle = NULL; @@ -599,6 +584,8 @@ static int ocfs2_extend_allocation(struct inode *inode, struct ocfs2_alloc_context *meta_ac = NULL; enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + int did_quota = 0; mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); @@ -606,42 +593,33 @@ static int ocfs2_extend_allocation(struct inode *inode, * This function only exists for file systems which don't * support holes. */ - BUG_ON(ocfs2_sparse_alloc(osb)); + BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { mlog_errno(status); goto leave; } - fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto leave; - } - - logical_start = OCFS2_I(inode)->ip_clusters; restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - /* blocks peope in read/write from reading our allocation - * until we're done changing it. We depend on i_mutex to block - * other extend/truncate calls while we're here. Ordering wrt - * start_trans is important here -- always do it before! */ - down_write(&OCFS2_I(inode)->ip_alloc_sem); - drop_alloc_sem = 1; - - status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, - &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " + "clusters_to_add = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), + clusters_to_add); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); if (status) { mlog_errno(status); goto leave; } - credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, + clusters_to_add); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -651,11 +629,18 @@ restart_all: } restarted_transaction: + if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, + clusters_to_add))) { + status = -EDQUOT; + goto leave; + } + did_quota = 1; + /* reserve a write to the file entry early on - that we if we * run out of credits in the allocation path, we can still * update i_size. */ - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -663,15 +648,16 @@ restarted_transaction: prev_clusters = OCFS2_I(inode)->ip_clusters; - status = ocfs2_do_extend_allocation(osb, - inode, - &logical_start, - clusters_to_add, - bh, - handle, - data_ac, - meta_ac, - &why); + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); if ((status < 0) && (status != -EAGAIN)) { if (status != -ENOSPC) mlog_errno(status); @@ -687,6 +673,10 @@ restarted_transaction: spin_lock(&OCFS2_I(inode)->ip_lock); clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); + /* Release unused quota reservation */ + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + did_quota = 0; if (why != RESTART_NONE && clusters_to_add) { if (why == RESTART_META) { @@ -698,7 +688,7 @@ restarted_transaction: mlog(0, "restarting transaction.\n"); /* TODO: This can be more intelligent. */ credits = ocfs2_calc_extend_credits(osb->sb, - fe, + &fe->id2.i_list, clusters_to_add); status = ocfs2_extend_trans(handle, credits); if (status < 0) { @@ -713,15 +703,15 @@ restarted_transaction: } mlog(0, "fe: i_clusters = %u, i_size=%llu\n", - fe->i_clusters, (unsigned long long)fe->i_size); + le32_to_cpu(fe->i_clusters), + (unsigned long long)le64_to_cpu(fe->i_size)); mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", - OCFS2_I(inode)->ip_clusters, i_size_read(inode)); + OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); leave: - if (drop_alloc_sem) { - up_write(&OCFS2_I(inode)->ip_alloc_sem); - drop_alloc_sem = 0; - } + if (status < 0 && did_quota) + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); handle = NULL; @@ -738,10 +728,8 @@ leave: restart_func = 0; goto restart_all; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; mlog_exit(status); return status; @@ -749,8 +737,7 @@ leave: /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to - * worry about recursive locking in ->prepare_write() and - * ->commit_write(). */ + * worry about recursive locking in ->write_begin() and ->write_end(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 size) { @@ -838,25 +825,48 @@ out: return ret; } -/* - * A tail_to_skip value > 0 indicates that we're being called from - * ocfs2_file_aio_write(). This has the following implications: - * - * - we don't want to update i_size - * - di_bh will be NULL, which is fine because it's only used in the - * case where we want to update i_size. - * - ocfs2_zero_extend() will then only be filling the hole created - * between i_size and the start of the write. - */ +int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) +{ + int ret; + u32 clusters_to_add; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); + if (clusters_to_add < oi->ip_clusters) + clusters_to_add = 0; + else + clusters_to_add -= oi->ip_clusters; + + if (clusters_to_add) { + ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Call this even if we don't add any clusters to the tree. We + * still need to zero the area between the old i_size and the + * new i_size. + */ + ret = ocfs2_zero_extend(inode, zero_to); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + static int ocfs2_extend_file(struct inode *inode, struct buffer_head *di_bh, - u64 new_i_size, - size_t tail_to_skip) + u64 new_i_size) { int ret = 0; - u32 clusters_to_add = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - BUG_ON(!tail_to_skip && !di_bh); + BUG_ON(!di_bh); /* setattr sometimes calls us like this. */ if (new_i_size == 0) @@ -866,58 +876,60 @@ static int ocfs2_extend_file(struct inode *inode, goto out; BUG_ON(new_i_size < i_size_read(inode)); - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { - BUG_ON(tail_to_skip != 0); + /* + * Fall through for converting inline data, even if the fs + * supports sparse files. + * + * The check for inline data here is legal - nobody can add + * the feature since we have i_mutex. We must check it again + * after acquiring ip_alloc_sem though, as paths like mmap + * might have raced us to converting the inode to extents. + */ + if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) goto out_update_size; - } - clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - - OCFS2_I(inode)->ip_clusters; - - /* - * protect the pages that ocfs2_zero_extend is going to be - * pulling into the page cache.. we do this before the - * metadata extend so that we don't get into the situation - * where we've extended the metadata but can't get the data - * lock to zero. + /* + * The alloc sem blocks people in read/write from reading our + * allocation until we're done changing it. We depend on + * i_mutex to block other extend/truncate calls while we're + * here. */ - ret = ocfs2_data_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + down_write(&oi->ip_alloc_sem); + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * We can optimize small extends by keeping the inodes + * inline data. + */ + if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { + up_write(&oi->ip_alloc_sem); + goto out_update_size; + } + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + up_write(&oi->ip_alloc_sem); - if (clusters_to_add) { - ret = ocfs2_extend_allocation(inode, clusters_to_add); - if (ret < 0) { mlog_errno(ret); - goto out_unlock; + goto out; } } - /* - * Call this even if we don't add any clusters to the tree. We - * still need to zero the area between the old i_size and the - * new i_size. - */ - ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); + + up_write(&oi->ip_alloc_sem); + if (ret < 0) { mlog_errno(ret); - goto out_unlock; + goto out; } out_update_size: - if (!tail_to_skip) { - /* We're being called from ocfs2_setattr() which wants - * us to update i_size */ - ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); - if (ret < 0) - mlog_errno(ret); - } - -out_unlock: - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ocfs2_data_unlock(inode, 1); + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) + mlog_errno(ret); out: return ret; @@ -931,10 +943,17 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; handle_t *handle = NULL; + int qtype; + struct dquot *transfer_from[MAXQUOTAS] = { }; + struct dquot *transfer_to[MAXQUOTAS] = { }; mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); + /* ensuring we don't even attempt to truncate a symlink */ + if (S_ISLNK(inode->i_mode)) + attr->ia_valid &= ~ATTR_SIZE; + if (attr->ia_valid & ATTR_MODE) mlog(0, "mode change: %d\n", attr->ia_mode); if (attr->ia_valid & ATTR_UID) @@ -966,7 +985,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - status = ocfs2_meta_lock(inode, &bh, 1); + status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -974,10 +993,21 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } if (size_change && attr->ia_size != i_size_read(inode)) { - if (i_size_read(inode) > attr->ia_size) - status = ocfs2_truncate_file(inode, bh, attr->ia_size); - else - status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); + if (attr->ia_size > sb->s_maxbytes) { + status = -EFBIG; + goto bail_unlock; + } + + if (i_size_read(inode) > attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } + status = ocfs2_truncate_file(inode, bh, attr->ia_size); + } else + status = ocfs2_extend_file(inode, bh, attr->ia_size); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -986,13 +1016,63 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto bail_unlock; + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + /* + * Gather pointers to quota structures so that allocation / + * freeing of quota structures happens here and not inside + * vfs_dq_transfer() where we have problems with lock ordering + */ + if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, + USRQUOTA); + transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, + USRQUOTA); + if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) { + status = -ESRCH; + goto bail_unlock; + } + } + if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, + GRPQUOTA); + transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, + GRPQUOTA); + if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) { + status = -ESRCH; + goto bail_unlock; + } + } + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + + 2 * ocfs2_quota_trans_credits(sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + if (status < 0) + goto bail_commit; + } else { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } } + /* + * This will intentionally not wind up calling vmtruncate(), + * since all the work for a size change has been done above. + * Otherwise, we could get into problems with truncate as + * ip_alloc_sem is used there to protect against i_size + * changes. + */ status = inode_setattr(inode, attr); if (status < 0) { mlog_errno(status); @@ -1006,13 +1086,24 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - if (bh) - brelse(bh); + brelse(bh); + + /* Release quota pointers in case we acquired them */ + for (qtype = 0; qtype < MAXQUOTAS; qtype++) { + dqput(transfer_to[qtype]); + dqput(transfer_from[qtype]); + } + + if (!status && attr->ia_valid & ATTR_MODE) { + status = ocfs2_acl_chmod(inode); + if (status < 0) + mlog_errno(status); + } mlog_exit(status); return status; @@ -1047,59 +1138,52 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +int ocfs2_permission(struct inode *inode, int mask) { int ret; mlog_entry_void(); - ret = ocfs2_meta_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, NULL, 0); if (ret) { if (ret != -ENOENT) mlog_errno(ret); goto out; } - ret = generic_permission(inode, mask, NULL); + ret = generic_permission(inode, mask, ocfs2_check_acl); - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); out: mlog_exit(ret); return ret; } -static int ocfs2_write_remove_suid(struct inode *inode) +static int __ocfs2_write_remove_suid(struct inode *inode, + struct buffer_head *bh) { int ret; - struct buffer_head *bh = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(inode); handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di; mlog_entry("(Inode %llu, mode 0%o)\n", - (unsigned long long)oi->ip_blkno, inode->i_mode); + (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } - ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_trans; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out_bh; - } - inode->i_mode &= ~S_ISUID; if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) inode->i_mode &= ~S_ISGID; @@ -1110,27 +1194,528 @@ static int ocfs2_write_remove_suid(struct inode *inode) ret = ocfs2_journal_dirty(handle, bh); if (ret < 0) mlog_errno(ret); -out_bh: - brelse(bh); -out_trans: + +out_trans: + ocfs2_commit_trans(osb, handle); +out: + mlog_exit(ret); + return ret; +} + +/* + * Will look for holes and unwritten extents in the range starting at + * pos for count bytes (inclusive). + */ +static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_write_remove_suid(struct inode *inode) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = ocfs2_read_inode_block(inode, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_write_remove_suid(inode, bh); +out: + brelse(bh); + return ret; +} + +/* + * Allocate enough extents to cover the region starting at byte offset + * start for len bytes. Existing extents are skipped, any extents + * added are marked as "unwritten". + */ +static int ocfs2_allocate_unwritten_extents(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u32 cpos, phys_cpos, clusters, alloc_size; + u64 end = start + len; + struct buffer_head *di_bh = NULL; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Nothing to do if the requested reservation range + * fits within the inode. + */ + if (ocfs2_size_fits_inline_data(di_bh, end)) + goto out; + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * We consider both start and len to be inclusive. + */ + cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); + clusters -= cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Hole or existing extent len can be arbitrary, so + * cap it to our own allocation request. + */ + if (alloc_size > clusters) + alloc_size = clusters; + + if (phys_cpos) { + /* + * We already have an allocation at this + * region so we can safely skip it. + */ + goto next; + } + + ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +next: + cpos += alloc_size; + clusters -= alloc_size; + } + + ret = 0; +out: + + brelse(di_bh); + return ret; +} + +/* + * Truncate a byte range, avoiding pages within partial clusters. This + * preserves those pages for the zeroing code to write to. + */ +static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, + u64 byte_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + loff_t start, end; + struct address_space *mapping = inode->i_mapping; + + start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); + end = byte_start + byte_len; + end = end & ~(osb->s_clustersize - 1); + + if (start < end) { + unmap_mapping_range(mapping, start, end - start, 0); + truncate_inode_pages_range(mapping, start, end - 1); + } +} + +static int ocfs2_zero_partial_clusters(struct inode *inode, + u64 start, u64 len) +{ + int ret = 0; + u64 tmpend, end = start + len; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int csize = osb->s_clustersize; + handle_t *handle; + + /* + * The "start" and "end" values are NOT necessarily part of + * the range whose allocation is being deleted. Rather, this + * is what the user passed in with the request. We must zero + * partial clusters here. There's no need to worry about + * physical allocation - the zeroing code knows to skip holes. + */ + mlog(0, "byte start: %llu, end: %llu\n", + (unsigned long long)start, (unsigned long long)end); + + /* + * If both edges are on a cluster boundary then there's no + * zeroing required as the region is part of the allocation to + * be truncated. + */ + if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + /* + * We want to get the byte offset of the end of the 1st cluster. + */ + tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; + + mlog(0, "1st range: start: %llu, tmpend: %llu\n", + (unsigned long long)start, (unsigned long long)tmpend); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); + if (ret) + mlog_errno(ret); + + if (tmpend < end) { + /* + * This may make start and end equal, but the zeroing + * code will skip any work in that case so there's no + * need to catch it up here. + */ + start = end & ~(osb->s_clustersize - 1); + + mlog(0, "2nd range: start: %llu, end: %llu\n", + (unsigned long long)start, (unsigned long long)end); + + ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); + if (ret) + mlog_errno(ret); + } + + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +static int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) +{ + int ret = 0; + u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + struct address_space *mapping = inode->i_mapping; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + + if (byte_len == 0) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, + byte_start + byte_len, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + /* + * There's no need to get fancy with the page cache + * truncate of an inline-data inode. We're talking + * about less than a page here, which will be cached + * in the dinode buffer anyway. + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + goto out; + } + + trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); + trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; + if (trunc_len >= trunc_start) + trunc_len -= trunc_start; + else + trunc_len = 0; + + mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)byte_start, + (unsigned long long)byte_len, trunc_start, trunc_len); + + ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos = trunc_start; + while (trunc_len) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + /* Only do work for non-holes */ + if (phys_cpos != 0) { + ret = ocfs2_remove_btree_range(inode, &et, cpos, + phys_cpos, alloc_size, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += alloc_size; + trunc_len -= alloc_size; + } + + ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); + +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +/* + * Parts of this function taken from xfs_change_file_space() + */ +static int __ocfs2_change_file_space(struct file *file, struct inode *inode, + loff_t f_pos, unsigned int cmd, + struct ocfs2_space_resv *sr, + int change_size) +{ + int ret; + s64 llen; + loff_t size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + handle_t *handle; + unsigned long long max_off = inode->i_sb->s_maxbytes; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes on other nodes + */ + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_rw_unlock; + } + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + ret = -EPERM; + goto out_inode_unlock; + } + + switch (sr->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + sr->l_start += f_pos; + break; + case 2: /*SEEK_END*/ + sr->l_start += i_size_read(inode); + break; + default: + ret = -EINVAL; + goto out_inode_unlock; + } + sr->l_whence = 0; + + llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; + + if (sr->l_start < 0 + || sr->l_start > max_off + || (sr->l_start + llen) < 0 + || (sr->l_start + llen) > max_off) { + ret = -EINVAL; + goto out_inode_unlock; + } + size = sr->l_start + sr->l_len; + + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (sr->l_len <= 0) { + ret = -EINVAL; + goto out_inode_unlock; + } + } + + if (file && should_remove_suid(file->f_path.dentry)) { + ret = __ocfs2_write_remove_suid(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out_inode_unlock; + } + } + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + switch (cmd) { + case OCFS2_IOC_RESVSP: + case OCFS2_IOC_RESVSP64: + /* + * This takes unsigned offsets, but the signed ones we + * pass have been checked against overflow above. + */ + ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, + sr->l_len); + break; + case OCFS2_IOC_UNRESVSP: + case OCFS2_IOC_UNRESVSP64: + ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, + sr->l_len); + break; + default: + ret = -EINVAL; + } + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_inode_unlock; + } + + /* + * We update c/mtime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_inode_unlock; + } + + if (change_size && i_size_read(inode) < size) + i_size_write(inode, size); + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); + if (ret < 0) + mlog_errno(ret); + ocfs2_commit_trans(osb, handle); + +out_inode_unlock: + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); + out: - mlog_exit(ret); + mutex_unlock(&inode->i_mutex); return ret; } -/* - * Will look for holes and unwritten extents in the range starting at - * pos for count bytes (inclusive). - */ -static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, - size_t count) +int ocfs2_change_file_space(struct file *file, unsigned int cmd, + struct ocfs2_space_resv *sr) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && + !ocfs2_writes_unwritten_extents(osb)) + return -ENOTTY; + else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && + !ocfs2_sparse_alloc(osb)) + return -ENOTTY; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); +} + +static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, + loff_t len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_space_resv sr; + int change_size = 1; + + if (!ocfs2_writes_unwritten_extents(osb)) + return -EOPNOTSUPP; + + if (S_ISDIR(inode->i_mode)) + return -ENODEV; + + if (mode & FALLOC_FL_KEEP_SIZE) + change_size = 0; + + sr.l_whence = 0; + sr.l_start = (s64)offset; + sr.l_len = (s64)len; + + return __ocfs2_change_file_space(NULL, inode, offset, + OCFS2_IOC_RESVSP64, &sr, change_size); +} + +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count) { int ret = 0; unsigned int extent_flags; u32 cpos, clusters, extent_len, phys_cpos; struct super_block *sb = inode->i_sb; + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || + !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; @@ -1142,7 +1727,7 @@ static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, goto out; } - if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { + if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { ret = 1; break; } @@ -1157,24 +1742,49 @@ out: return ret; } +static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + loff_t pos, size_t count, + int *meta_level) +{ + int ret; + struct buffer_head *di_bh = NULL; + u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 clusters = + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + *meta_level = 1; + + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + if (ret) + mlog_errno(ret); +out: + brelse(di_bh); + return ret; +} + static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, int appending, - int *direct_io) + int *direct_io, + int *has_refcount) { - int ret = 0, meta_level = appending; + int ret = 0, meta_level = 0; struct inode *inode = dentry->d_inode; - u32 clusters; - loff_t newsize, saved_pos; + loff_t saved_pos, end; /* - * We sample i_size under a read level meta lock to see if our write - * is extending the file, if it is we back off and get a write level - * meta lock. + * We start with a read level meta lock and only jump to an ex + * if we need to make modifications here. */ for(;;) { - ret = ocfs2_meta_lock(inode, NULL, meta_level); + ret = ocfs2_inode_lock(inode, NULL, meta_level); if (ret < 0) { meta_level = -1; mlog_errno(ret); @@ -1192,7 +1802,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, * set inode->i_size at the end of a write. */ if (should_remove_suid(dentry)) { if (meta_level == 0) { - ocfs2_meta_unlock(inode, meta_level); + ocfs2_inode_unlock(inode, meta_level); meta_level = 1; continue; } @@ -1213,87 +1823,69 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, saved_pos = *ppos; } - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { - loff_t end = saved_pos + count; + end = saved_pos + count; - /* - * Skip the O_DIRECT checks if we don't need - * them. - */ - if (!direct_io || !(*direct_io)) - break; + ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); + if (ret == 1) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = -1; - /* - * Allowing concurrent direct writes means - * i_size changes wouldn't be synchronized, so - * one node could wind up truncating another - * nodes writes. - */ - if (end > i_size_read(inode)) { - *direct_io = 0; - break; - } + ret = ocfs2_prepare_inode_for_refcount(inode, + saved_pos, + count, + &meta_level); + if (has_refcount) + *has_refcount = 1; + } - /* - * We don't fill holes during direct io, so - * check for them here. If any are found, the - * caller will have to retake some cluster - * locks and initiate the io as buffered. - */ - ret = ocfs2_check_range_for_holes(inode, saved_pos, - count); - if (ret == 1) { - *direct_io = 0; - ret = 0; - } else if (ret < 0) - mlog_errno(ret); - break; + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; } /* - * The rest of this loop is concerned with legacy file - * systems which don't support sparse files. + * Skip the O_DIRECT checks if we don't need + * them. */ - - newsize = count + saved_pos; - - mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", - (long long) saved_pos, (long long) newsize, - (long long) i_size_read(inode)); - - /* No need for a higher level metadata lock if we're - * never going past i_size. */ - if (newsize <= i_size_read(inode)) + if (!direct_io || !(*direct_io)) break; - if (meta_level == 0) { - ocfs2_meta_unlock(inode, meta_level); - meta_level = 1; - continue; + /* + * There's no sane way to do direct writes to an inode + * with inline data. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + *direct_io = 0; + break; } - spin_lock(&OCFS2_I(inode)->ip_lock); - clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - - OCFS2_I(inode)->ip_clusters; - spin_unlock(&OCFS2_I(inode)->ip_lock); - - mlog(0, "Writing at EOF, may need more allocation: " - "i_size = %lld, newsize = %lld, need %u clusters\n", - (long long) i_size_read(inode), (long long) newsize, - clusters); - - /* We only want to continue the rest of this loop if - * our extend will actually require more - * allocation. */ - if (!clusters) + if (has_refcount && *has_refcount == 1) { + *direct_io = 0; + break; + } + /* + * Allowing concurrent direct writes means + * i_size changes wouldn't be synchronized, so + * one node could wind up truncating another + * nodes writes. + */ + if (end > i_size_read(inode)) { + *direct_io = 0; break; - - ret = ocfs2_extend_file(inode, NULL, newsize, count); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out_unlock; } + + /* + * We don't fill holes during direct io, so + * check for them here. If any are found, the + * caller will have to retake some cluster + * locks and initiate the io as buffered. + */ + ret = ocfs2_check_range_for_holes(inode, saved_pos, count); + if (ret == 1) { + *direct_io = 0; + ret = 0; + } else if (ret < 0) + mlog_errno(ret); break; } @@ -1301,165 +1893,28 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, *ppos = saved_pos; out_unlock: - ocfs2_meta_unlock(inode, meta_level); + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); out: return ret; } -static inline void -ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) -{ - const struct iovec *iov = *iovp; - size_t base = *basep; - - do { - int copy = min(bytes, iov->iov_len - base); - - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - base = 0; - } - } while (bytes); - *iovp = iov; - *basep = base; -} - -static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, - const struct iovec *cur_iov, - size_t iov_offset) -{ - int ret; - char *buf; - struct page *src_page = NULL; - - buf = cur_iov->iov_base + iov_offset; - - if (!segment_eq(get_fs(), KERNEL_DS)) { - /* - * Pull in the user page. We want to do this outside - * of the meta data locks in order to preserve locking - * order in case of page fault. - */ - ret = get_user_pages(current, current->mm, - (unsigned long)buf & PAGE_CACHE_MASK, 1, - 0, 0, &src_page, NULL); - if (ret == 1) - bp->b_src_buf = kmap(src_page); - else - src_page = ERR_PTR(-EFAULT); - } else { - bp->b_src_buf = buf; - } - - return src_page; -} - -static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, - struct page *page) -{ - if (page) { - kunmap(page); - page_cache_release(page); - } -} - -static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, - const struct iovec *iov, - unsigned long nr_segs, - size_t count, - ssize_t o_direct_written) -{ - int ret = 0; - ssize_t copied, total = 0; - size_t iov_offset = 0; - const struct iovec *cur_iov = iov; - struct ocfs2_buffered_write_priv bp; - struct page *page; - - /* - * handle partial DIO write. Adjust cur_iov if needed. - */ - ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); - - do { - bp.b_cur_off = iov_offset; - bp.b_cur_iov = cur_iov; - - page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto out; - } - - copied = ocfs2_buffered_write_cluster(file, *ppos, count, - ocfs2_map_and_write_user_data, - &bp); - - ocfs2_put_write_source(&bp, page); - - if (copied < 0) { - mlog_errno(copied); - ret = copied; - goto out; - } - - total += copied; - *ppos = *ppos + copied; - count -= copied; - - ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); - } while(count); - -out: - return total ? total : ret; -} - -static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted, - unsigned long *nr_segs) -{ - size_t ocount; /* original count */ - unsigned long seg; - - ocount = 0; - for (seg = 0; seg < *nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - ocount += iv->iov_len; - if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - *nr_segs = seg; - ocount -= iv->iov_len; /* This segment is no good */ - break; - } - - *counted = ocount; - return 0; -} - static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { int ret, direct_io, appending, rw_level, have_alloc_sem = 0; - int can_do_direct, sync = 0; + int can_do_direct, has_refcount = 0; ssize_t written = 0; size_t ocount; /* original count */ size_t count; /* after file limit checks */ - loff_t *ppos = &iocb->ki_pos; + loff_t old_size, *ppos = &iocb->ki_pos; + u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %u, '%.*s')\n", file, (unsigned int)nr_segs, @@ -1469,12 +1924,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - ret = ocfs2_check_iovec(iov, &ocount, &nr_segs); - if (ret) - return ret; - - count = ocount; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); appending = file->f_flags & O_APPEND ? 1 : 0; @@ -1500,7 +1949,7 @@ relock: can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, iocb->ki_left, appending, - &can_do_direct); + &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); goto out; @@ -1518,54 +1967,72 @@ relock: rw_level = -1; direct_io = 0; - sync = 1; goto relock; } - if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) - sync = 1; - - /* - * XXX: Is it ok to execute these checks a second time? - */ - ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); - if (ret) - goto out; - /* - * Set pos so that sync_page_range_nolock() below understands - * where to start from. We might've moved it around via the - * calls above. The range we want to actually sync starts from - * *ppos here. - * + * To later detect whether a journal commit for sync writes is + * necessary, we sample i_size, and cluster count here. */ - pos = *ppos; + old_size = i_size_read(inode); + old_clusters = OCFS2_I(inode)->ip_clusters; /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_rw_locked(iocb); + ocfs2_iocb_set_rw_locked(iocb, rw_level); if (direct_io) { + ret = generic_segment_checks(iov, &nr_segs, &ocount, + VERIFY_READ); + if (ret) + goto out_dio; + + count = ocount; + ret = generic_write_checks(file, ppos, &count, + S_ISBLK(inode->i_mode)); + if (ret) + goto out_dio; + written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, ppos, count, ocount); if (written < 0) { + /* + * direct write may have instantiated a few + * blocks outside i_size. Trim these off again. + * Don't need i_size_read because we hold i_mutex. + */ + if (*ppos + count > inode->i_size) + vmtruncate(inode, inode->i_size); ret = written; goto out_dio; } } else { - written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, - count, written); - if (written < 0) { - ret = written; - if (ret != -EFAULT || ret != -ENOSPC) - mlog_errno(ret); - goto out; - } + written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); } out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode) || + (file->f_flags & O_DIRECT && has_refcount)) { + ret = filemap_fdatawrite_range(file->f_mapping, pos, + pos + count - 1); + if (ret < 0) + written = ret; + + if (!ret && (old_size != i_size_read(inode) || + old_clusters != OCFS2_I(inode)->ip_clusters || + has_refcount)) { + ret = jbd2_journal_force_commit(osb->journal->j_journal); + if (ret < 0) + written = ret; + } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, pos, + pos + count - 1); + } + /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that @@ -1589,96 +2056,28 @@ out_sems: if (have_alloc_sem) up_read(&inode->i_alloc_sem); - if (written > 0 && sync) { - ssize_t err; - - err = sync_page_range_nolock(inode, file->f_mapping, pos, count); - if (err < 0) - written = err; - } - mutex_unlock(&inode->i_mutex); + if (written) + ret = written; mlog_exit(ret); - return written ? written : ret; -} - -static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, - struct splice_desc *sd) -{ - int ret, count, total = 0; - ssize_t copied = 0; - struct ocfs2_splice_write_priv sp; - - ret = buf->ops->pin(pipe, buf); - if (ret) - goto out; - - sp.s_sd = sd; - sp.s_buf = buf; - sp.s_pipe = pipe; - sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; - sp.s_buf_offset = buf->offset; - - count = sd->len; - if (count + sp.s_offset > PAGE_CACHE_SIZE) - count = PAGE_CACHE_SIZE - sp.s_offset; - - do { - /* - * splice wants us to copy up to one page at a - * time. For pagesize > cluster size, this means we - * might enter ocfs2_buffered_write_cluster() more - * than once, so keep track of our progress here. - */ - copied = ocfs2_buffered_write_cluster(sd->file, - (loff_t)sd->pos + total, - count, - ocfs2_map_and_write_splice_data, - &sp); - if (copied < 0) { - mlog_errno(copied); - ret = copied; - goto out; - } - - count -= copied; - sp.s_offset += copied; - sp.s_buf_offset += copied; - total += copied; - } while (count); - - ret = 0; -out: - - return total ? total : ret; + return ret; } -static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, - loff_t *ppos, - size_t len, - unsigned int flags) +static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, + struct file *out, + struct splice_desc *sd) { - int ret, err; - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - - ret = __splice_from_pipe(pipe, out, ppos, len, flags, - ocfs2_splice_write_actor); - if (ret > 0) { - *ppos += ret; + int ret; - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - if (err) - ret = err; - } + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + sd->total_len, 0, NULL, NULL); + if (ret < 0) { + mlog_errno(ret); + return ret; } - return ret; + return splice_from_pipe_feed(pipe, sd, pipe_to_file); } static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, @@ -1688,35 +2087,61 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, unsigned int flags) { int ret; - struct inode *inode = out->f_path.dentry->d_inode; + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, (unsigned int)len, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name); - inode_double_lock(inode, pipe->inode); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, - NULL); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) + mlog_errno(ret); + else { + ret = ocfs2_splice_to_file(pipe, out, &sd); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); - /* ok, we're done with i_size and alloc work */ - ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); -out_unlock: - ocfs2_rw_unlock(inode, 1); -out: - inode_double_unlock(inode, pipe->inode); + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + int err; + + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; + + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } mlog_exit(ret); return ret; @@ -1728,7 +2153,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in, size_t len, unsigned int flags) { - int ret = 0; + int ret = 0, lock_level = 0; struct inode *inode = in->f_path.dentry->d_inode; mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, @@ -1739,12 +2164,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in, /* * See the comment in ocfs2_file_aio_read() */ - ret = ocfs2_meta_lock(inode, NULL, 0); + ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); if (ret < 0) { mlog_errno(ret); goto bail; } - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, lock_level); ret = generic_file_splice_read(in, ppos, pipe, len, flags); @@ -1788,7 +2213,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } rw_level = 0; /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_rw_locked(iocb); + ocfs2_iocb_set_rw_locked(iocb, rw_level); } /* @@ -1800,16 +2225,16 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * like i_size. This allows the checks down below * generic_file_aio_read() a chance of actually working. */ - ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); if (ret < 0) { mlog_errno(ret); goto bail; } - ocfs2_meta_unlock(inode, lock_level); + ocfs2_inode_unlock(inode, lock_level); ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); if (ret == -EINVAL) - mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); + mlog(0, "generic_file_aio_read returned -EINVAL\n"); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); @@ -1834,6 +2259,12 @@ const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, + .fallocate = ocfs2_fallocate, + .fiemap = ocfs2_fiemap, }; const struct inode_operations ocfs2_special_file_iops = { @@ -1842,24 +2273,86 @@ const struct inode_operations ocfs2_special_file_iops = { .permission = ocfs2_permission, }; +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ const struct file_operations ocfs2_fops = { + .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, - .sendfile = generic_file_sendfile, .mmap = ocfs2_mmap, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, .aio_read = ocfs2_file_aio_read, .aio_write = ocfs2_file_aio_write, - .ioctl = ocfs2_ioctl, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, }; const struct file_operations ocfs2_dops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. + */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ocfs2_readdir, .fsync = ocfs2_sync_file, - .ioctl = ocfs2_ioctl, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, };