X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Focfs2%2Ffile.c;h=c2a87c885b73672db90e52aae9497c34842d1c65;hb=3a6b42cadc112b01daf0525e5fcd90bb333a5bb3;hp=b1ae4c754157e00ee34e3c7344eafe5c04e72369;hpb=7c08d70c69150148c14f02633855f1591219c37c;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index b1ae4c7..c2a87c8 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -35,6 +35,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_INODE #include @@ -51,9 +52,13 @@ #include "inode.h" #include "ioctl.h" #include "journal.h" +#include "locks.h" #include "mmap.h" #include "suballoc.h" #include "super.h" +#include "xattr.h" +#include "acl.h" +#include "quota.h" #include "buffer_head_io.h" @@ -63,6 +68,35 @@ static int ocfs2_sync_inode(struct inode *inode) return sync_mapping_buffers(inode->i_mapping); } +static int ocfs2_init_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp; + + fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->fp_file = file; + mutex_init(&fp->fp_mutex); + ocfs2_file_lock_res_init(&fp->fp_flock, fp); + file->private_data = fp; + + return 0; +} + +static void ocfs2_free_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (fp) { + ocfs2_simple_drop_lockres(osb, &fp->fp_flock); + ocfs2_lock_res_free(&fp->fp_flock); + kfree(fp); + file->private_data = NULL; + } +} + static int ocfs2_file_open(struct inode *inode, struct file *file) { int status; @@ -89,7 +123,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) oi->ip_open_count++; spin_unlock(&oi->ip_lock); - status = 0; + + status = ocfs2_init_file_private(inode, file); + if (status) { + /* + * We want to set open count back if we're failing the + * open. + */ + spin_lock(&oi->ip_lock); + oi->ip_open_count--; + spin_unlock(&oi->ip_lock); + } + leave: mlog_exit(status); return status; @@ -108,11 +153,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file) oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; spin_unlock(&oi->ip_lock); + ocfs2_free_file_private(inode, file); + mlog_exit(0); return 0; } +static int ocfs2_dir_open(struct inode *inode, struct file *file) +{ + return ocfs2_init_file_private(inode, file); +} + +static int ocfs2_dir_release(struct inode *inode, struct file *file) +{ + ocfs2_free_file_private(inode, file); + return 0; +} + static int ocfs2_sync_file(struct file *file, struct dentry *dentry, int datasync) @@ -130,7 +188,7 @@ static int ocfs2_sync_file(struct file *file, goto bail; journal = osb->journal->j_journal; - err = journal_force_commit(journal); + err = jbd2_journal_force_commit(journal); bail: mlog_exit(err); @@ -192,14 +250,14 @@ int ocfs2_update_inode_atime(struct inode *inode, mlog_entry_void(); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -248,17 +306,17 @@ bail: return status; } -static int ocfs2_simple_size_update(struct inode *inode, - struct buffer_head *di_bh, - u64 new_i_size) +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } @@ -295,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; @@ -314,7 +372,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, } i_size_write(inode, new_i_size); - inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); inode->i_ctime = inode->i_mtime = CURRENT_TIME; di = (struct ocfs2_dinode *) fe_bh->b_data; @@ -347,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)new_i_size); + /* We trust di_bh because it comes from ocfs2_inode_lock(), which + * already validated it */ fe = (struct ocfs2_dinode *) di_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail; - } mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), "Inode %llu, inode i_size = %lld != di " @@ -383,21 +437,25 @@ static int ocfs2_truncate_file(struct inode *inode, down_write(&OCFS2_I(inode)->ip_alloc_sem); - /* This forces other nodes to sync and drop their pages. Do - * this even if we have a truncate without allocation change - - * ocfs2 cluster sizes can be much greater than page size, so - * we have to truncate them anyway. */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - up_write(&OCFS2_I(inode)->ip_alloc_sem); - - mlog_errno(status); - goto bail; - } - + /* + * The inode lock forced other nodes to sync and drop their + * pages, which (correctly) happens even if we have a truncate + * without allocation change - ocfs2 cluster sizes can be much + * greater than page size, so we have to truncate them + * anyway. + */ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); truncate_inode_pages(inode->i_mapping, new_i_size); + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + status = ocfs2_truncate_inline(inode, di_bh, new_i_size, + i_size_read(inode), 1); + if (status) + mlog_errno(status); + + goto bail_unlock_sem; + } + /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking @@ -405,25 +463,23 @@ static int ocfs2_truncate_file(struct inode *inode, status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } status = ocfs2_commit_truncate(osb, inode, di_bh, tc); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } /* TODO: orphan dir cleanup here. */ -bail_unlock_data: - ocfs2_data_unlock(inode, 1); - +bail_unlock_sem: up_write(&OCFS2_I(inode)->ip_alloc_sem); bail: @@ -433,7 +489,7 @@ bail: } /* - * extend allocation only here. + * extend file allocation only here. * we'll update all the disk stuff, and oip->alloc_size * * expect stuff to be locked, a transaction started and enough data / @@ -442,189 +498,25 @@ bail: * Will return -EAGAIN, and a reason if a restart is needed. * If passed in, *reason will always be set, even in error. */ -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret) -{ - int status = 0; - int free_extents; - struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; - enum ocfs2_alloc_restarted reason = RESTART_NONE; - u32 bit_off, num_bits; - u64 block; - u8 flags = 0; - - BUG_ON(!clusters_to_add); - - if (mark_unwritten) - flags = OCFS2_EXT_UNWRITTEN; - - free_extents = ocfs2_num_free_extents(osb, inode, fe); - if (free_extents < 0) { - status = free_extents; - mlog_errno(status); - goto leave; - } - - /* there are two cases which could cause us to EAGAIN in the - * we-need-more-metadata case: - * 1) we haven't reserved *any* - * 2) we are so fragmented, we've needed to add metadata too - * many times. */ - if (!free_extents && !meta_ac) { - mlog(0, "we haven't reserved any metadata!\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } else if ((!free_extents) - && (ocfs2_alloc_context_bits_left(meta_ac) - < ocfs2_extend_meta_needed(fe))) { - mlog(0, "filesystem is really fragmented...\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } - - status = ocfs2_claim_clusters(osb, handle, data_ac, 1, - &bit_off, &num_bits); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - - BUG_ON(num_bits > clusters_to_add); - - /* reserve our write early -- insert_extent may update the inode */ - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, - *logical_offset, block, num_bits, - flags, meta_ac); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - clusters_to_add -= num_bits; - *logical_offset += num_bits; - - if (clusters_to_add) { - mlog(0, "need to alloc once more, clusters = %u, wanted = " - "%u\n", fe->i_clusters, clusters_to_add); - status = -EAGAIN; - reason = RESTART_TRANS; - } - -leave: - mlog_exit(status); - if (reason_ret) - *reason_ret = reason; - return status; -} - -/* - * For a given allocation, determine which allocators will need to be - * accessed, and lock them, reserving the appropriate number of bits. - * - * Sparse file systems call this from ocfs2_write_begin_nolock() - * and ocfs2_allocate_unwritten_extents(). - * - * File systems which don't support holes call this from - * ocfs2_extend_allocation(). - */ -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac) +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) { - int ret = 0, num_free_extents; - unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - *meta_ac = NULL; - if (data_ac) - *data_ac = NULL; - - BUG_ON(clusters_to_add != 0 && data_ac == NULL); - - mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " - "clusters_to_add = %u, extents_to_split = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), - le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); - - num_free_extents = ocfs2_num_free_extents(osb, inode, di); - if (num_free_extents < 0) { - ret = num_free_extents; - mlog_errno(ret); - goto out; - } - - /* - * Sparse allocation file systems need to be more conservative - * with reserving room for expansion - the actual allocation - * happens while we've got a journal handle open so re-taking - * a cluster lock (because we ran out of room for another - * extent) will violate ordering rules. - * - * Most of the time we'll only be seeing this 1 cluster at a time - * anyway. - * - * Always lock for any unwritten extents - we might want to - * add blocks during a split. - */ - if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { - ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - } - - if (clusters_to_add == 0) - goto out; + int ret; + struct ocfs2_extent_tree et; - ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - -out: - if (ret) { - if (*meta_ac) { - ocfs2_free_alloc_context(*meta_ac); - *meta_ac = NULL; - } - - /* - * We cannot have an error and a non null *data_ac. - */ - } + ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); + ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, + clusters_to_add, mark_unwritten, + &et, handle, + data_ac, meta_ac, reason_ret); return ret; } @@ -643,6 +535,8 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, struct ocfs2_alloc_context *meta_ac = NULL; enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + int did_quota = 0; mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); @@ -652,31 +546,31 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { mlog_errno(status); goto leave; } - fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto leave; - } restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, - &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " + "clusters_to_add = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), + clusters_to_add); + ocfs2_init_dinode_extent_tree(&et, inode, bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); if (status) { mlog_errno(status); goto leave; } - credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, + clusters_to_add); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -686,11 +580,18 @@ restart_all: } restarted_transaction: + if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, + clusters_to_add))) { + status = -EDQUOT; + goto leave; + } + did_quota = 1; + /* reserve a write to the file entry early on - that we if we * run out of credits in the allocation path, we can still * update i_size. */ - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -698,16 +599,16 @@ restarted_transaction: prev_clusters = OCFS2_I(inode)->ip_clusters; - status = ocfs2_do_extend_allocation(osb, - inode, - &logical_start, - clusters_to_add, - mark_unwritten, - bh, - handle, - data_ac, - meta_ac, - &why); + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); if ((status < 0) && (status != -EAGAIN)) { if (status != -ENOSPC) mlog_errno(status); @@ -723,6 +624,10 @@ restarted_transaction: spin_lock(&OCFS2_I(inode)->ip_lock); clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); + /* Release unused quota reservation */ + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + did_quota = 0; if (why != RESTART_NONE && clusters_to_add) { if (why == RESTART_META) { @@ -734,7 +639,7 @@ restarted_transaction: mlog(0, "restarting transaction.\n"); /* TODO: This can be more intelligent. */ credits = ocfs2_calc_extend_credits(osb->sb, - fe, + &fe->id2.i_list, clusters_to_add); status = ocfs2_extend_trans(handle, credits); if (status < 0) { @@ -752,9 +657,12 @@ restarted_transaction: le32_to_cpu(fe->i_clusters), (unsigned long long)le64_to_cpu(fe->i_size)); mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", - OCFS2_I(inode)->ip_clusters, i_size_read(inode)); + OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); leave: + if (status < 0 && did_quota) + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); handle = NULL; @@ -771,38 +679,16 @@ leave: restart_func = 0; goto restart_all; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; mlog_exit(status); return status; } -static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) -{ - int ret; - - /* - * The alloc sem blocks peope in read/write from reading our - * allocation until we're done changing it. We depend on - * i_mutex to block other extend/truncate calls while we're - * here. - */ - down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add, - mark_unwritten); - up_write(&OCFS2_I(inode)->ip_alloc_sem); - - return ret; -} - /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to - * worry about recursive locking in ->prepare_write() and - * ->commit_write(). */ + * worry about recursive locking in ->write_begin() and ->write_end(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 size) { @@ -890,25 +776,48 @@ out: return ret; } -/* - * A tail_to_skip value > 0 indicates that we're being called from - * ocfs2_file_aio_write(). This has the following implications: - * - * - we don't want to update i_size - * - di_bh will be NULL, which is fine because it's only used in the - * case where we want to update i_size. - * - ocfs2_zero_extend() will then only be filling the hole created - * between i_size and the start of the write. - */ +int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) +{ + int ret; + u32 clusters_to_add; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); + if (clusters_to_add < oi->ip_clusters) + clusters_to_add = 0; + else + clusters_to_add -= oi->ip_clusters; + + if (clusters_to_add) { + ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Call this even if we don't add any clusters to the tree. We + * still need to zero the area between the old i_size and the + * new i_size. + */ + ret = ocfs2_zero_extend(inode, zero_to); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + static int ocfs2_extend_file(struct inode *inode, struct buffer_head *di_bh, - u64 new_i_size, - size_t tail_to_skip) + u64 new_i_size) { int ret = 0; - u32 clusters_to_add = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - BUG_ON(!tail_to_skip && !di_bh); + BUG_ON(!di_bh); /* setattr sometimes calls us like this. */ if (new_i_size == 0) @@ -918,60 +827,60 @@ static int ocfs2_extend_file(struct inode *inode, goto out; BUG_ON(new_i_size < i_size_read(inode)); - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { - BUG_ON(tail_to_skip != 0); + /* + * Fall through for converting inline data, even if the fs + * supports sparse files. + * + * The check for inline data here is legal - nobody can add + * the feature since we have i_mutex. We must check it again + * after acquiring ip_alloc_sem though, as paths like mmap + * might have raced us to converting the inode to extents. + */ + if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) goto out_update_size; - } - - clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - - OCFS2_I(inode)->ip_clusters; - /* - * protect the pages that ocfs2_zero_extend is going to be - * pulling into the page cache.. we do this before the - * metadata extend so that we don't get into the situation - * where we've extended the metadata but can't get the data - * lock to zero. + /* + * The alloc sem blocks people in read/write from reading our + * allocation until we're done changing it. We depend on + * i_mutex to block other extend/truncate calls while we're + * here. */ - ret = ocfs2_data_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + down_write(&oi->ip_alloc_sem); + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + /* + * We can optimize small extends by keeping the inodes + * inline data. + */ + if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { + up_write(&oi->ip_alloc_sem); + goto out_update_size; + } + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + up_write(&oi->ip_alloc_sem); - if (clusters_to_add) { - ret = ocfs2_extend_allocation(inode, - OCFS2_I(inode)->ip_clusters, - clusters_to_add, 0); - if (ret < 0) { mlog_errno(ret); - goto out_unlock; + goto out; } } - /* - * Call this even if we don't add any clusters to the tree. We - * still need to zero the area between the old i_size and the - * new i_size. - */ - ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); + + up_write(&oi->ip_alloc_sem); + if (ret < 0) { mlog_errno(ret); - goto out_unlock; + goto out; } out_update_size: - if (!tail_to_skip) { - /* We're being called from ocfs2_setattr() which wants - * us to update i_size */ - ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); - if (ret < 0) - mlog_errno(ret); - } - -out_unlock: - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ocfs2_data_unlock(inode, 1); + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) + mlog_errno(ret); out: return ret; @@ -985,10 +894,17 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; handle_t *handle = NULL; + int locked[MAXQUOTAS] = {0, 0}; + int credits, qtype; + struct ocfs2_mem_dqinfo *oinfo; mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); + /* ensuring we don't even attempt to truncate a symlink */ + if (S_ISLNK(inode->i_mode)) + attr->ia_valid &= ~ATTR_SIZE; + if (attr->ia_valid & ATTR_MODE) mlog(0, "mode change: %d\n", attr->ia_mode); if (attr->ia_valid & ATTR_UID) @@ -1020,7 +936,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - status = ocfs2_meta_lock(inode, &bh, 1); + status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1028,10 +944,21 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } if (size_change && attr->ia_size != i_size_read(inode)) { - if (i_size_read(inode) > attr->ia_size) + if (attr->ia_size > sb->s_maxbytes) { + status = -EFBIG; + goto bail_unlock; + } + + if (i_size_read(inode) > attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } status = ocfs2_truncate_file(inode, bh, attr->ia_size); - else - status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); + } else + status = ocfs2_extend_file(inode, bh, attr->ia_size); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -1040,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto bail_unlock; + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + credits = OCFS2_INODE_UPDATE_CREDITS; + if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto bail_unlock; + credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + + ocfs2_calc_qdel_credits(sb, USRQUOTA); + locked[USRQUOTA] = 1; + } + if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto bail_unlock; + credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + + ocfs2_calc_qdel_credits(sb, GRPQUOTA); + locked[GRPQUOTA] = 1; + } + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + if (status < 0) + goto bail_commit; + } else { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } } /* @@ -1067,13 +1030,24 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_meta_unlock(inode, 1); + for (qtype = 0; qtype < MAXQUOTAS; qtype++) { + if (!locked[qtype]) + continue; + oinfo = sb_dqinfo(sb, qtype)->dqi_priv; + ocfs2_unlock_global_qf(oinfo, 1); + } + ocfs2_inode_unlock(inode, 1); bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - if (bh) - brelse(bh); + brelse(bh); + + if (!status && attr->ia_valid & ATTR_MODE) { + status = ocfs2_acl_chmod(inode); + if (status < 0) + mlog_errno(status); + } mlog_exit(status); return status; @@ -1108,22 +1082,22 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +int ocfs2_permission(struct inode *inode, int mask) { int ret; mlog_entry_void(); - ret = ocfs2_meta_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, NULL, 0); if (ret) { if (ret != -ENOENT) mlog_errno(ret); goto out; } - ret = generic_permission(inode, mask, NULL); + ret = generic_permission(inode, mask, ocfs2_check_acl); - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); out: mlog_exit(ret); return ret; @@ -1141,14 +1115,14 @@ static int __ocfs2_write_remove_suid(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_trans; @@ -1214,10 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode) { int ret; struct buffer_head *bh = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(inode); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_inode_block(inode, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1239,6 +1211,29 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, { int ret; u32 cpos, phys_cpos, clusters, alloc_size; + u64 end = start + len; + struct buffer_head *di_bh = NULL; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Nothing to do if the requested reservation range + * fits within the inode. + */ + if (ocfs2_size_fits_inline_data(di_bh, end)) + goto out; + + ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } /* * We consider both start and len to be inclusive. @@ -1265,99 +1260,27 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, if (phys_cpos) { /* * We already have an allocation at this - * region so we can safely skip it. - */ - goto next; - } - - ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - -next: - cpos += alloc_size; - clusters -= alloc_size; - } - - ret = 0; -out: - return ret; -} - -static int __ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, - u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc) -{ - int ret; - u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - handle_t *handle; - struct ocfs2_alloc_context *meta_ac = NULL; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - - ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); - if (ret) { - mlog_errno(ret); - return ret; - } - - mutex_lock(&tl_inode->i_mutex); - - if (ocfs2_truncate_log_needs_flush(osb)) { - ret = __ocfs2_flush_truncate_log(osb); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - - handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, - dealloc); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + * region so we can safely skip it. + */ + goto next; + } - OCFS2_I(inode)->ip_clusters -= len; - di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); + ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; +next: + cpos += alloc_size; + clusters -= alloc_size; } - ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); - if (ret) - mlog_errno(ret); - -out_commit: - ocfs2_commit_trans(osb, handle); + ret = 0; out: - mutex_unlock(&tl_inode->i_mutex); - - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); + brelse(di_bh); return ret; } @@ -1410,8 +1333,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, goto out; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (handle == NULL) { - ret = -ENOMEM; + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); mlog_errno(ret); goto out; } @@ -1459,12 +1382,33 @@ static int ocfs2_remove_inode_range(struct inode *inode, u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_cached_dealloc_ctxt dealloc; + struct address_space *mapping = inode->i_mapping; + struct ocfs2_extent_tree et; + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); ocfs2_init_dealloc_ctxt(&dealloc); if (byte_len == 0) return 0; + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, + byte_start + byte_len, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + /* + * There's no need to get fancy with the page cache + * truncate of an inline-data inode. We're talking + * about less than a page here, which will be cached + * in the dinode buffer anyway. + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + goto out; + } + trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; if (trunc_len >= trunc_start) @@ -1497,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode, /* Only do work for non-holes */ if (phys_cpos != 0) { - ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, - phys_cpos, alloc_size, - &dealloc); + ret = ocfs2_remove_btree_range(inode, &et, cpos, + phys_cpos, alloc_size, + &dealloc); if (ret) { mlog_errno(ret); goto out; @@ -1549,7 +1493,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out; } - ret = ocfs2_meta_lock(inode, &di_bh, 1); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { mlog_errno(ret); goto out_rw_unlock; @@ -1557,7 +1501,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { ret = -EPERM; - goto out_meta_unlock; + goto out_inode_unlock; } switch (sr->l_whence) { @@ -1571,7 +1515,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, break; default: ret = -EINVAL; - goto out_meta_unlock; + goto out_inode_unlock; } sr->l_whence = 0; @@ -1582,14 +1526,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, || (sr->l_start + llen) < 0 || (sr->l_start + llen) > max_off) { ret = -EINVAL; - goto out_meta_unlock; + goto out_inode_unlock; } size = sr->l_start + sr->l_len; if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { if (sr->l_len <= 0) { ret = -EINVAL; - goto out_meta_unlock; + goto out_inode_unlock; } } @@ -1597,7 +1541,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); - goto out_meta_unlock; + goto out_inode_unlock; } } @@ -1623,7 +1567,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, up_write(&OCFS2_I(inode)->ip_alloc_sem); if (ret) { mlog_errno(ret); - goto out_meta_unlock; + goto out_inode_unlock; } /* @@ -1633,7 +1577,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_meta_unlock; + goto out_inode_unlock; } if (change_size && i_size_read(inode) < size) @@ -1646,14 +1590,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, ocfs2_commit_trans(osb, handle); -out_meta_unlock: +out_inode_unlock: brelse(di_bh); - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); out_rw_unlock: ocfs2_rw_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); out: + mutex_unlock(&inode->i_mutex); return ret; } @@ -1661,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, struct ocfs2_space_resv *sr) { struct inode *inode = file->f_path.dentry->d_inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && !ocfs2_writes_unwritten_extents(osb)) @@ -1709,18 +1653,16 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, int appending, int *direct_io) { - int ret = 0, meta_level = appending; + int ret = 0, meta_level = 0; struct inode *inode = dentry->d_inode; - u32 clusters; - loff_t newsize, saved_pos; + loff_t saved_pos, end; /* - * We sample i_size under a read level meta lock to see if our write - * is extending the file, if it is we back off and get a write level - * meta lock. + * We start with a read level meta lock and only jump to an ex + * if we need to make modifications here. */ for(;;) { - ret = ocfs2_meta_lock(inode, NULL, meta_level); + ret = ocfs2_inode_lock(inode, NULL, meta_level); if (ret < 0) { meta_level = -1; mlog_errno(ret); @@ -1738,7 +1680,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, * set inode->i_size at the end of a write. */ if (should_remove_suid(dentry)) { if (meta_level == 0) { - ocfs2_meta_unlock(inode, meta_level); + ocfs2_inode_unlock(inode, meta_level); meta_level = 1; continue; } @@ -1759,87 +1701,47 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, saved_pos = *ppos; } - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { - loff_t end = saved_pos + count; - - /* - * Skip the O_DIRECT checks if we don't need - * them. - */ - if (!direct_io || !(*direct_io)) - break; - - /* - * Allowing concurrent direct writes means - * i_size changes wouldn't be synchronized, so - * one node could wind up truncating another - * nodes writes. - */ - if (end > i_size_read(inode)) { - *direct_io = 0; - break; - } + end = saved_pos + count; - /* - * We don't fill holes during direct io, so - * check for them here. If any are found, the - * caller will have to retake some cluster - * locks and initiate the io as buffered. - */ - ret = ocfs2_check_range_for_holes(inode, saved_pos, - count); - if (ret == 1) { - *direct_io = 0; - ret = 0; - } else if (ret < 0) - mlog_errno(ret); + /* + * Skip the O_DIRECT checks if we don't need + * them. + */ + if (!direct_io || !(*direct_io)) break; - } /* - * The rest of this loop is concerned with legacy file - * systems which don't support sparse files. + * There's no sane way to do direct writes to an inode + * with inline data. */ - - newsize = count + saved_pos; - - mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", - (long long) saved_pos, (long long) newsize, - (long long) i_size_read(inode)); - - /* No need for a higher level metadata lock if we're - * never going past i_size. */ - if (newsize <= i_size_read(inode)) + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + *direct_io = 0; break; - - if (meta_level == 0) { - ocfs2_meta_unlock(inode, meta_level); - meta_level = 1; - continue; } - spin_lock(&OCFS2_I(inode)->ip_lock); - clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - - OCFS2_I(inode)->ip_clusters; - spin_unlock(&OCFS2_I(inode)->ip_lock); - - mlog(0, "Writing at EOF, may need more allocation: " - "i_size = %lld, newsize = %lld, need %u clusters\n", - (long long) i_size_read(inode), (long long) newsize, - clusters); - - /* We only want to continue the rest of this loop if - * our extend will actually require more - * allocation. */ - if (!clusters) + /* + * Allowing concurrent direct writes means + * i_size changes wouldn't be synchronized, so + * one node could wind up truncating another + * nodes writes. + */ + if (end > i_size_read(inode)) { + *direct_io = 0; break; - - ret = ocfs2_extend_file(inode, NULL, newsize, count); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out_unlock; } + + /* + * We don't fill holes during direct io, so + * check for them here. If any are found, the + * caller will have to retake some cluster + * locks and initiate the io as buffered. + */ + ret = ocfs2_check_range_for_holes(inode, saved_pos, count); + if (ret == 1) { + *direct_io = 0; + ret = 0; + } else if (ret < 0) + mlog_errno(ret); break; } @@ -1847,155 +1749,27 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, *ppos = saved_pos; out_unlock: - ocfs2_meta_unlock(inode, meta_level); + ocfs2_inode_unlock(inode, meta_level); out: return ret; } -static inline void -ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) -{ - const struct iovec *iov = *iovp; - size_t base = *basep; - - do { - int copy = min(bytes, iov->iov_len - base); - - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - base = 0; - } - } while (bytes); - *iovp = iov; - *basep = base; -} - -static struct page * ocfs2_get_write_source(char **ret_src_buf, - const struct iovec *cur_iov, - size_t iov_offset) -{ - int ret; - char *buf = cur_iov->iov_base + iov_offset; - struct page *src_page = NULL; - unsigned long off; - - off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; - - if (!segment_eq(get_fs(), KERNEL_DS)) { - /* - * Pull in the user page. We want to do this outside - * of the meta data locks in order to preserve locking - * order in case of page fault. - */ - ret = get_user_pages(current, current->mm, - (unsigned long)buf & PAGE_CACHE_MASK, 1, - 0, 0, &src_page, NULL); - if (ret == 1) - *ret_src_buf = kmap(src_page) + off; - else - src_page = ERR_PTR(-EFAULT); - } else { - *ret_src_buf = buf; - } - - return src_page; -} - -static void ocfs2_put_write_source(struct page *page) -{ - if (page) { - kunmap(page); - page_cache_release(page); - } -} - -static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, - const struct iovec *iov, - unsigned long nr_segs, - size_t count, - ssize_t o_direct_written) -{ - int ret = 0; - ssize_t copied, total = 0; - size_t iov_offset = 0, bytes; - loff_t pos; - const struct iovec *cur_iov = iov; - struct page *user_page, *page; - char * uninitialized_var(buf); - char *dst; - void *fsdata; - - /* - * handle partial DIO write. Adjust cur_iov if needed. - */ - ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); - - do { - pos = *ppos; - - user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); - if (IS_ERR(user_page)) { - ret = PTR_ERR(user_page); - goto out; - } - - /* Stay within our page boundaries */ - bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), - (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); - /* Stay within the vector boundary */ - bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); - /* Stay within count */ - bytes = min(bytes, count); - - page = NULL; - ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, - &page, &fsdata); - if (ret) { - mlog_errno(ret); - goto out; - } - - dst = kmap_atomic(page, KM_USER0); - memcpy(dst + (pos & (loff_t)(PAGE_CACHE_SIZE - 1)), buf, bytes); - kunmap_atomic(dst, KM_USER0); - flush_dcache_page(page); - ocfs2_put_write_source(user_page); - - copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, - bytes, page, fsdata); - if (copied < 0) { - mlog_errno(copied); - ret = copied; - goto out; - } - - total += copied; - *ppos = pos + copied; - count -= copied; - - ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); - } while(count); - -out: - return total ? total : ret; -} - static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { int ret, direct_io, appending, rw_level, have_alloc_sem = 0; - int can_do_direct, sync = 0; + int can_do_direct; ssize_t written = 0; size_t ocount; /* original count */ size_t count; /* after file limit checks */ - loff_t *ppos = &iocb->ki_pos; + loff_t old_size, *ppos = &iocb->ki_pos; + u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %u, '%.*s')\n", file, (unsigned int)nr_segs, @@ -2005,12 +1779,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (ret) - return ret; - - count = ocount; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); appending = file->f_flags & O_APPEND ? 1 : 0; @@ -2054,54 +1822,67 @@ relock: rw_level = -1; direct_io = 0; - sync = 1; goto relock; } - if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) - sync = 1; - - /* - * XXX: Is it ok to execute these checks a second time? - */ - ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); - if (ret) - goto out; - /* - * Set pos so that sync_page_range_nolock() below understands - * where to start from. We might've moved it around via the - * calls above. The range we want to actually sync starts from - * *ppos here. - * + * To later detect whether a journal commit for sync writes is + * necessary, we sample i_size, and cluster count here. */ - pos = *ppos; + old_size = i_size_read(inode); + old_clusters = OCFS2_I(inode)->ip_clusters; /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); if (direct_io) { + ret = generic_segment_checks(iov, &nr_segs, &ocount, + VERIFY_READ); + if (ret) + goto out_dio; + + ret = generic_write_checks(file, ppos, &count, + S_ISBLK(inode->i_mode)); + if (ret) + goto out_dio; + written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, ppos, count, ocount); if (written < 0) { + /* + * direct write may have instantiated a few + * blocks outside i_size. Trim these off again. + * Don't need i_size_read because we hold i_mutex. + */ + if (*ppos + count > inode->i_size) + vmtruncate(inode, inode->i_size); ret = written; goto out_dio; } } else { - written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, - count, written); - if (written < 0) { - ret = written; - if (ret != -EFAULT || ret != -ENOSPC) - mlog_errno(ret); - goto out; - } + written = generic_file_aio_write_nolock(iocb, iov, nr_segs, + *ppos); } out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { + /* + * The generic write paths have handled getting data + * to disk, but since we don't make use of the dirty + * inode list, a manual journal commit is necessary + * here. + */ + if (old_size != i_size_read(inode) || + old_clusters != OCFS2_I(inode)->ip_clusters) { + ret = jbd2_journal_force_commit(osb->journal->j_journal); + if (ret < 0) + written = ret; + } + } + /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that @@ -2125,73 +1906,35 @@ out_sems: if (have_alloc_sem) up_read(&inode->i_alloc_sem); - if (written > 0 && sync) { - ssize_t err; - - err = sync_page_range_nolock(inode, file->f_mapping, pos, count); - if (err < 0) - written = err; - } - mutex_unlock(&inode->i_mutex); mlog_exit(ret); return written ? written : ret; } -static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, - struct splice_desc *sd) +static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, + struct file *out, + struct splice_desc *sd) { - int ret, count; - ssize_t copied = 0; - struct file *file = sd->u.file; - unsigned int offset; - struct page *page = NULL; - void *fsdata; - char *src, *dst; - - ret = buf->ops->confirm(pipe, buf); - if (ret) - goto out; - - offset = sd->pos & ~PAGE_CACHE_MASK; - count = sd->len; - if (count + offset > PAGE_CACHE_SIZE) - count = PAGE_CACHE_SIZE - offset; + int ret; - ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, - &page, &fsdata); - if (ret) { + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + sd->total_len, 0, NULL); + if (ret < 0) { mlog_errno(ret); - goto out; - } - - src = buf->ops->map(pipe, buf, 1); - dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + buf->offset, count); - kunmap_atomic(dst, KM_USER1); - buf->ops->unmap(pipe, buf, src); - - copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, - page, fsdata); - if (copied < 0) { - mlog_errno(copied); - ret = copied; - goto out; + return ret; } -out: - return copied ? copied : ret; + return splice_from_pipe_feed(pipe, sd, pipe_to_file); } -static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, - loff_t *ppos, - size_t len, - unsigned int flags) +static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, + loff_t *ppos, + size_t len, + unsigned int flags) { - int ret, err; + int ret; struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; struct splice_desc sd = { @@ -2201,57 +1944,67 @@ static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, .u.file = out, }; - ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); - if (ret > 0) { - *ppos += ret; - - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - if (err) - ret = err; - } - } - - return ret; -} - -static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, - loff_t *ppos, - size_t len, - unsigned int flags) -{ - int ret; - struct inode *inode = out->f_path.dentry->d_inode; - mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, (unsigned int)len, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name); - inode_double_lock(inode, pipe->inode); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, - NULL); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) + mlog_errno(ret); + else { + ret = ocfs2_splice_to_file(pipe, out, &sd); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); - /* ok, we're done with i_size and alloc work */ - ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); -out_unlock: - ocfs2_rw_unlock(inode, 1); -out: - inode_double_unlock(inode, pipe->inode); + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + unsigned long nr_pages; + + *ppos += ret; + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = ocfs2_rw_lock(inode, 1); + if (err < 0) { + mlog_errno(err); + } else { + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + ocfs2_rw_unlock(inode, 1); + } + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); + } mlog_exit(ret); return ret; @@ -2274,12 +2027,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in, /* * See the comment in ocfs2_file_aio_read() */ - ret = ocfs2_meta_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, NULL, 0); if (ret < 0) { mlog_errno(ret); goto bail; } - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); ret = generic_file_splice_read(in, ppos, pipe, len, flags); @@ -2335,16 +2088,16 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * like i_size. This allows the checks down below * generic_file_aio_read() a chance of actually working. */ - ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); if (ret < 0) { mlog_errno(ret); goto bail; } - ocfs2_meta_unlock(inode, lock_level); + ocfs2_inode_unlock(inode, lock_level); ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); if (ret == -EINVAL) - mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); + mlog(0, "generic_file_aio_read returned -EINVAL\n"); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); @@ -2369,7 +2122,12 @@ const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, .fallocate = ocfs2_fallocate, + .fiemap = ocfs2_fiemap, }; const struct inode_operations ocfs2_special_file_iops = { @@ -2378,7 +2136,12 @@ const struct inode_operations ocfs2_special_file_iops = { .permission = ocfs2_permission, }; +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ const struct file_operations ocfs2_fops = { + .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .mmap = ocfs2_mmap, @@ -2387,20 +2150,72 @@ const struct file_operations ocfs2_fops = { .open = ocfs2_file_open, .aio_read = ocfs2_file_aio_read, .aio_write = ocfs2_file_aio_write, - .ioctl = ocfs2_ioctl, + .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, }; const struct file_operations ocfs2_dops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. + */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ocfs2_readdir, .fsync = ocfs2_sync_file, - .ioctl = ocfs2_ioctl, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .flock = ocfs2_flock, };