X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Focfs2%2Faops.c;h=b401654011a2b64c4e55f87c7916e3badc07f68e;hb=bcc24fb42585dc9f490cf7789a917358414bdab5;hp=b74971e19d54cf58744b53e0a58aa6474b29954d;hpb=8110b073a9135acf0a71bccfc20c0d1023f179c6;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b74971e..b401654 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #define MLOG_MASK_PREFIX ML_FILE_IO #include @@ -67,21 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, goto bail; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - &bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { mlog_errno(status); goto bail; } fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)fe->i_blkno, 7, fe->i_signature); - goto bail; - } - if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, le32_to_cpu(fe->i_clusters))) { mlog(ML_ERROR, "block offset is outside the allocated size: " @@ -126,8 +120,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, err = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(err); return err; @@ -138,7 +131,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, { int err = 0; unsigned int ext_flags; - u64 p_blkno, past_eof; + u64 max_blocks = bh_result->b_size >> inode->i_blkbits; + u64 p_blkno, count, past_eof; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, @@ -154,7 +148,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } - err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, &ext_flags); if (err) { mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " @@ -163,20 +157,32 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } + if (max_blocks < count) + count = max_blocks; + /* * ocfs2 never allocates in this function - the only time we * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New * allows block_prepare_write() to zero. + * + * If we see this on a sparse file system, then a truncate has + * raced us and removed the cluster. In this case, we clear + * the buffers dirty and uptodate bits and let the buffer code + * ignore it as a hole. */ - mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), - "ino %lu, iblock %llu\n", inode->i_ino, - (unsigned long long)iblock); + if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { + clear_buffer_dirty(bh_result); + clear_buffer_uptodate(bh_result); + goto bail; + } /* Treat the unwritten extent as a hole for zeroing purposes. */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) map_bh(bh_result, inode->i_sb, p_blkno); + bh_result->b_size = count << inode->i_blkbits; + if (!ocfs2_sparse_alloc(osb)) { if (p_blkno == 0) { err = -EIO; @@ -187,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); dump_stack(); + goto bail; } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); @@ -205,15 +212,75 @@ bail: return err; } +int ocfs2_read_inline_data(struct inode *inode, struct page *page, + struct buffer_head *di_bh) +{ + void *kaddr; + loff_t size; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + return -EROFS; + } + + size = i_size_read(inode); + + if (size > PAGE_CACHE_SIZE || + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { + ocfs2_error(inode->i_sb, + "Inode %llu has with inline data has bad size: %Lu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)size); + return -EROFS; + } + + kaddr = kmap_atomic(page, KM_USER0); + if (size) + memcpy(kaddr, di->id2.i_data.id_data, size); + /* Clear the remaining part of the page */ + memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + SetPageUptodate(page); + + return 0; +} + +static int ocfs2_readpage_inline(struct inode *inode, struct page *page) +{ + int ret; + struct buffer_head *di_bh = NULL; + + BUG_ON(!PageLocked(page)); + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_inline_data(inode, page, di_bh); +out: + unlock_page(page); + + brelse(di_bh); + return ret; +} + static int ocfs2_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; int ret, unlock = 1; mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); - ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); + ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -221,43 +288,38 @@ static int ocfs2_readpage(struct file *file, struct page *page) goto out; } - down_read(&OCFS2_I(inode)->ip_alloc_sem); + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ret = AOP_TRUNCATED_PAGE; + goto out_inode_unlock; + } /* * i_size might have just been updated as we grabed the meta lock. We * might now be discovering a truncate that hit on another node. * block_read_full_page->get_block freaks out if it is asked to read * beyond the end of a file, so we check here. Callers - * (generic_file_read, fault->nopage) are clever enough to check i_size + * (generic_file_read, vm_ops->fault) are clever enough to check i_size * and notice that the page they just read isn't needed. * * XXX sys_readahead() seems to get that wrong? */ if (start >= i_size_read(inode)) { - char *addr = kmap(page); - memset(addr, 0, PAGE_SIZE); - flush_dcache_page(page); - kunmap(page); + zero_user(page, 0, PAGE_SIZE); SetPageUptodate(page); ret = 0; goto out_alloc; } - ret = ocfs2_data_lock_with_page(inode, 0, page); - if (ret != 0) { - if (ret == AOP_TRUNCATED_PAGE) - unlock = 0; - mlog_errno(ret); - goto out_alloc; - } - - ret = block_read_full_page(page, ocfs2_get_block); + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + ret = ocfs2_readpage_inline(inode, page); + else + ret = block_read_full_page(page, ocfs2_get_block); unlock = 0; - ocfs2_data_unlock(inode, 0); out_alloc: up_read(&OCFS2_I(inode)->ip_alloc_sem); - ocfs2_meta_unlock(inode, 0); +out_inode_unlock: + ocfs2_inode_unlock(inode, 0); out: if (unlock) unlock_page(page); @@ -265,6 +327,62 @@ out: return ret; } +/* + * This is used only for read-ahead. Failures or difficult to handle + * situations are safe to ignore. + * + * Right now, we don't bother with BH_Boundary - in-inode extent lists + * are quite large (243 extents on 4k blocks), so most inodes don't + * grow out to a tree. If need be, detecting boundary extents could + * trivially be added in a future version of ocfs2_get_block(). + */ +static int ocfs2_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret, err = -EIO; + struct inode *inode = mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start; + struct page *last; + + /* + * Use the nonblocking flag for the dlm code to avoid page + * lock inversion, but don't bother with retrying. + */ + ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); + if (ret) + return err; + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ocfs2_inode_unlock(inode, 0); + return err; + } + + /* + * Don't bother with inline-data. There isn't anything + * to read-ahead in that case anyway... + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto out_unlock; + + /* + * Check whether a remote node truncated this file - we just + * drop out in that case as it's not worth handling here. + */ + last = list_entry(pages->prev, struct page, lru); + start = (loff_t)last->index << PAGE_CACHE_SHIFT; + if (start >= i_size_read(inode)) + goto out_unlock; + + err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + +out_unlock: + up_read(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 0); + + return err; +} + /* Note: Because we don't support holes, our allocation has * already happened (allocation writes zeros to the file data) * so we don't have to worry about ordered writes in @@ -299,12 +417,8 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, { int ret; - down_read(&OCFS2_I(inode)->ip_alloc_sem); - ret = block_prepare_write(page, from, to, ocfs2_get_block); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - return ret; } @@ -350,27 +464,24 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, unsigned to) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle = NULL; + handle_t *handle; int ret = 0; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (!handle) { + if (IS_ERR(handle)) { ret = -ENOMEM; mlog_errno(ret); goto out; } if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, NULL, - ocfs2_journal_dirty_data); - if (ret < 0) + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) mlog_errno(ret); } out: if (ret) { - if (handle) + if (!IS_ERR(handle)) ocfs2_commit_trans(osb, handle); handle = ERR_PTR(ret); } @@ -390,7 +501,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) * accessed concurrently from multiple nodes. */ if (!INODE_JOURNAL(inode)) { - err = ocfs2_meta_lock(inode, NULL, 0); + err = ocfs2_inode_lock(inode, NULL, 0); if (err) { if (err != -ENOENT) mlog_errno(err); @@ -399,11 +510,13 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) down_read(&OCFS2_I(inode)->ip_alloc_sem); } - err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); if (!INODE_JOURNAL(inode)) { up_read(&OCFS2_I(inode)->ip_alloc_sem); - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); } if (err) { @@ -413,7 +526,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) goto bail; } - bail: status = err ? 0 : p_blkno; @@ -469,7 +581,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { ocfs2_error(inode->i_sb, "Inode %llu has a hole at block %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -522,12 +634,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, void *private) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int level; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + ocfs2_iocb_clear_rw_locked(iocb); - up_read(&inode->i_alloc_sem); - ocfs2_rw_unlock(inode, 0); + + level = ocfs2_iocb_rw_locked_level(iocb); + if (!level) + up_read(&inode->i_alloc_sem); + ocfs2_rw_unlock(inode, level); } /* @@ -539,7 +656,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - journal_invalidatepage(journal, page, offset); + jbd2_journal_invalidatepage(journal, page, offset); } static int ocfs2_releasepage(struct page *page, gfp_t wait) @@ -548,7 +665,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) if (!page_has_buffers(page)) return 0; - return journal_try_to_free_buffers(journal, page, wait); + return jbd2_journal_try_to_free_buffers(journal, page, wait); } static ssize_t ocfs2_direct_IO(int rw, @@ -563,34 +680,19 @@ static ssize_t ocfs2_direct_IO(int rw, mlog_entry_void(); - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { - /* - * We get PR data locks even for O_DIRECT. This - * allows concurrent O_DIRECT I/O but doesn't let - * O_DIRECT with extending and buffered zeroing writes - * race. If they did race then the buffered zeroing - * could be written back after the O_DIRECT I/O. It's - * one thing to tell people not to mix buffered and - * O_DIRECT writes, but expecting them to understand - * that file extension is also an implicit buffered - * write is too much. By getting the PR we force - * writeback of the buffered zeroing before - * proceeding. - */ - ret = ocfs2_data_lock(inode, 0); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - ocfs2_data_unlock(inode, 0); - } + /* + * Fallback to buffered I/O if we see an inode without + * extents. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io); -out: + mlog_exit(ret); return ret; } @@ -654,6 +756,27 @@ static void ocfs2_clear_page_regions(struct page *page, } /* + * Nonsparse file systems fully allocate before we get to the write + * code. This prevents ocfs2_write() from tagging the write as an + * allocating one, which means ocfs2_map_page_blocks() might try to + * read-in the blocks at the tail of our file. Avoid reading them by + * testing i_size against each block offset. + */ +static int ocfs2_should_read_blk(struct inode *inode, struct page *page, + unsigned int block_start) +{ + u64 offset = page_offset(page) + block_start; + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + return 1; + + if (i_size_read(inode) > offset) + return 1; + + return 0; +} + +/* * Some of this taken from block_prepare_write(). We already have our * mapping by now though, and the entire write will be allocating or * it won't, so not much need to use BH_New. @@ -677,6 +800,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, bh = bh->b_this_page, block_start += bsize) { block_end = block_start + bsize; + clear_buffer_new(bh); + /* * Ignore blocks outside of our i/o range - * they may belong to unallocated clusters. @@ -691,9 +816,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, * For an allocating write with cluster size >= page * size, we always write the entire page. */ - - if (buffer_new(bh)) - clear_buffer_new(bh); + if (new) + set_buffer_new(bh); if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); @@ -704,7 +828,9 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && - (block_start < from || block_end > to)) { + !buffer_new(bh) && + ocfs2_should_read_blk(inode, page, block_start) && + (block_start < from || block_end > to)) { ll_rw_block(READ, 1, &bh); *wait_bh++=bh; } @@ -731,18 +857,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, bh = head; block_start = 0; do { - void *kaddr; - block_end = block_start + bsize; if (block_end <= from) goto next_bh; if (block_start >= to) break; - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+block_start, 0, bh->b_size); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user(page, block_start, bh->b_size); set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -754,216 +875,235 @@ next_bh: return ret; } +#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#define OCFS2_MAX_CTXT_PAGES 1 +#else +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#endif + +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) + /* - * This will copy user data from the buffer page in the splice - * context. - * - * For now, we ignore SPLICE_F_MOVE as that would require some extra - * communication out all the way to ocfs2_write(). + * Describe the state of a single cluster to be written to. */ -int ocfs2_map_and_write_splice_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, u64 *p_blkno, - unsigned int *ret_from, unsigned int *ret_to) -{ - int ret; - unsigned int to, from, cluster_start, cluster_end; - char *src, *dst; - struct ocfs2_splice_write_priv *sp = wc->w_private; - struct pipe_buffer *buf = sp->s_buf; - unsigned long bytes, src_from; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +struct ocfs2_write_cluster_desc { + u32 c_cpos; + u32 c_phys; + /* + * Give this a unique field because c_phys eventually gets + * filled. + */ + unsigned c_new; + unsigned c_unwritten; + unsigned c_needs_zero; +}; - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, - &cluster_end); +struct ocfs2_write_ctxt { + /* Logical cluster position / len of write */ + u32 w_cpos; + u32 w_clen; - from = sp->s_offset; - src_from = sp->s_buf_offset; - bytes = wc->w_count; + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; - if (wc->w_large_pages) { - /* - * For cluster size < page size, we have to - * calculate pos within the cluster and obey - * the rightmost boundary. - */ - bytes = min(bytes, (unsigned long)(osb->s_clustersize - - (wc->w_pos & (osb->s_clustersize - 1)))); - } - to = from + bytes; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; - if (wc->w_this_page_new) - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - cluster_start, cluster_end, 1); - else - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - from, to, 0); - if (ret) { - mlog_errno(ret); - goto out; + /* + * This is true if page_size > cluster_size. + * + * It triggers a set of special cases during write which might + * have to deal with allocating writes to partial pages. + */ + unsigned int w_large_pages; + + /* + * Pages involved in this write. + * + * w_target_page is the page being written to by the user. + * + * w_pages is an array of pages which always contains + * w_target_page, and in the case of an allocating write with + * page_size < cluster size, it will contain zero'd and mapped + * pages adjacent to w_target_page which need to be written + * out in so that future reads from that region will get + * zero's. + */ + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; + unsigned int w_num_pages; + struct page *w_target_page; + + /* + * ocfs2_write_end() uses this to know what the real range to + * write in the target should be. + */ + unsigned int w_target_from; + unsigned int w_target_to; + + /* + * We could use journal_current_handle() but this is cleaner, + * IMHO -Mark + */ + handle_t *w_handle; + + struct buffer_head *w_di_bh; + + struct ocfs2_cached_dealloc_ctxt w_dealloc; +}; + +void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) +{ + int i; + + for(i = 0; i < num_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } } +} + +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +{ + ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); + + brelse(wc->w_di_bh); + kfree(wc); +} - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); - BUG_ON(from > osb->s_clustersize); - BUG_ON(to > osb->s_clustersize); +static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, + struct ocfs2_super *osb, loff_t pos, + unsigned len, struct buffer_head *di_bh) +{ + u32 cend; + struct ocfs2_write_ctxt *wc; - src = buf->ops->map(sp->s_pipe, buf, 1); - dst = kmap_atomic(wc->w_this_page, KM_USER1); - memcpy(dst + from, src + src_from, bytes); - kunmap_atomic(wc->w_this_page, KM_USER1); - buf->ops->unmap(sp->s_pipe, buf, src); + wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); + if (!wc) + return -ENOMEM; - wc->w_finished_copy = 1; + wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; + cend = (pos + len - 1) >> osb->s_clustersize_bits; + wc->w_clen = cend - wc->w_cpos + 1; + get_bh(di_bh); + wc->w_di_bh = di_bh; - *ret_from = from; - *ret_to = to; -out: + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + wc->w_large_pages = 1; + else + wc->w_large_pages = 0; - return bytes ? (unsigned int)bytes : ret; + ocfs2_init_dealloc_ctxt(&wc->w_dealloc); + + *wcp = wc; + + return 0; } /* - * This will copy user data from the iovec in the buffered write - * context. + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. */ -int ocfs2_map_and_write_user_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, u64 *p_blkno, - unsigned int *ret_from, unsigned int *ret_to) +static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) { - int ret; - unsigned int to, from, cluster_start, cluster_end; - unsigned long bytes, src_from; - char *dst; - struct ocfs2_buffered_write_priv *bp = wc->w_private; - const struct iovec *cur_iov = bp->b_cur_iov; - char __user *buf; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int block_start, block_end; + struct buffer_head *head, *bh; - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, - &cluster_end); + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; - buf = cur_iov->iov_base + bp->b_cur_off; - src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; - from = wc->w_pos & (PAGE_CACHE_SIZE - 1); + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, end; - /* - * This is a lot of comparisons, but it reads quite - * easily, which is important here. - */ - /* Stay within the src page */ - bytes = PAGE_SIZE - src_from; - /* Stay within the vector */ - bytes = min(bytes, - (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); - /* Stay within count */ - bytes = min(bytes, (unsigned long)wc->w_count); - /* - * For clustersize > page size, just stay within - * target page, otherwise we have to calculate pos - * within the cluster and obey the rightmost - * boundary. - */ - if (wc->w_large_pages) { - /* - * For cluster size < page size, we have to - * calculate pos within the cluster and obey - * the rightmost boundary. - */ - bytes = min(bytes, (unsigned long)(osb->s_clustersize - - (wc->w_pos & (osb->s_clustersize - 1)))); - } else { - /* - * cluster size > page size is the most common - * case - we just stay within the target page - * boundary. - */ - bytes = min(bytes, PAGE_CACHE_SIZE - from); - } + start = max(from, block_start); + end = min(to, block_end); - to = from + bytes; + zero_user_segment(page, start, end); + set_buffer_uptodate(bh); + } - if (wc->w_this_page_new) - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - cluster_start, cluster_end, 1); - else - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - from, to, 0); - if (ret) { - mlog_errno(ret); - goto out; - } + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); - BUG_ON(from > osb->s_clustersize); - BUG_ON(to > osb->s_clustersize); + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} - dst = kmap(wc->w_this_page); - memcpy(dst + from, bp->b_src_buf + src_from, bytes); - kunmap(wc->w_this_page); +/* + * Only called when we have a failure during allocating write to write + * zero's to the newly allocated region. + */ +static void ocfs2_write_failure(struct inode *inode, + struct ocfs2_write_ctxt *wc, + loff_t user_pos, unsigned user_len) +{ + int i; + unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), + to = user_pos + user_len; + struct page *tmppage; - /* - * XXX: This is slow, but simple. The caller of - * ocfs2_buffered_write_cluster() is responsible for - * passing through the iovecs, so it's difficult to - * predict what our next step is in here after our - * initial write. A future version should be pushing - * that iovec manipulation further down. - * - * By setting this, we indicate that a copy from user - * data was done, and subsequent calls for this - * cluster will skip copying more data. - */ - wc->w_finished_copy = 1; + ocfs2_zero_new_buffers(wc->w_target_page, from, to); - *ret_from = from; - *ret_to = to; -out: + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; - return bytes ? (unsigned int)bytes : ret; + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + + block_commit_write(tmppage, from, to); + } + } } -/* - * Map, fill and write a page to disk. - * - * The work of copying data is done via callback. Newly allocated - * pages which don't take user data will be zero'd (set 'new' to - * indicate an allocating write) - * - * Returns a negative error code or the number of bytes copied into - * the page. - */ -int ocfs2_write_data_page(struct inode *inode, handle_t *handle, - u64 *p_blkno, struct page *page, - struct ocfs2_write_ctxt *wc, int new) +static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, + struct page *page, u32 cpos, + loff_t user_pos, unsigned user_len, + int new) { - int ret, copied = 0; - unsigned int from = 0, to = 0; + int ret; + unsigned int map_from = 0, map_to = 0; unsigned int cluster_start, cluster_end; - unsigned int zero_from = 0, zero_to = 0; + unsigned int user_data_from = 0, user_data_to = 0; - ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, &cluster_start, &cluster_end); - if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index - && !wc->w_finished_copy) { - - wc->w_this_page = page; - wc->w_this_page_new = new; - ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); - if (ret < 0) { + if (page == wc->w_target_page) { + map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_to = map_from + user_len; + + if (new) + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, + new); + else + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + map_from, map_to, new); + if (ret) { mlog_errno(ret); goto out; } - copied = ret; - - zero_from = from; - zero_to = to; + user_data_from = map_from; + user_data_to = map_to; if (new) { - from = cluster_start; - to = cluster_end; + map_from = cluster_start; + map_to = cluster_end; } } else { /* @@ -973,11 +1113,11 @@ int ocfs2_write_data_page(struct inode *inode, handle_t *handle, */ BUG_ON(!new); - from = cluster_start; - to = cluster_end; + map_from = cluster_start; + map_to = cluster_end; ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, 1); + cluster_start, cluster_end, new); if (ret) { mlog_errno(ret); goto out; @@ -996,108 +1136,112 @@ int ocfs2_write_data_page(struct inode *inode, handle_t *handle, */ if (new && !PageUptodate(page)) ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), - wc->w_cpos, zero_from, zero_to); + cpos, user_data_from, user_data_to); flush_dcache_page(page); - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, NULL, - ocfs2_journal_dirty_data); - if (ret < 0) - mlog_errno(ret); - } - - /* - * We don't use generic_commit_write() because we need to - * handle our own i_size update. - */ - ret = block_commit_write(page, from, to); - if (ret) - mlog_errno(ret); out: - - return copied ? copied : ret; + return ret; } /* - * Do the actual write of some data into an inode. Optionally allocate - * in order to fulfill the write. - * - * cpos is the logical cluster offset within the file to write at - * - * 'phys' is the physical mapping of that offset. a 'phys' value of - * zero indicates that allocation is required. In this case, data_ac - * and meta_ac should be valid (meta_ac can be null if metadata - * allocation isn't required). + * This function will only grab one clusters worth of pages. */ -static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, - struct buffer_head *di_bh, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_write_ctxt *wc) +static int ocfs2_grab_pages_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, + u32 cpos, loff_t user_pos, int new, + struct page *mmap_page) { - int ret, i, numpages = 1, new; - unsigned int copied = 0; - u32 tmp_pos; - u64 v_blkno, p_blkno; - struct address_space *mapping = file->f_mapping; + int ret = 0, i; + unsigned long start, target_index, index; struct inode *inode = mapping->host; - unsigned long index, start; - struct page **cpages; - new = phys == 0 ? 1 : 0; + target_index = user_pos >> PAGE_CACHE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For * non allocating write, we just change the one * page. Otherwise, we'll need a whole clusters worth. */ - if (new) - numpages = ocfs2_pages_per_cluster(inode->i_sb); - - cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); - if (!cpages) { - ret = -ENOMEM; - mlog_errno(ret); - return ret; - } - - /* - * Fill our page array first. That way we've grabbed enough so - * that we can zero and flush if we error after adding the - * extent. - */ if (new) { - start = ocfs2_align_clusters_to_page_index(inode->i_sb, - wc->w_cpos); - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); + wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); } else { - start = wc->w_pos >> PAGE_CACHE_SHIFT; - v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; + wc->w_num_pages = 1; + start = target_index; } - for(i = 0; i < numpages; i++) { + for(i = 0; i < wc->w_num_pages; i++) { index = start + i; - cpages[i] = grab_cache_page(mapping, index); - if (!cpages[i]) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; + if (index == target_index && mmap_page) { + /* + * ocfs2_pagemkwrite() is a little different + * and wants us to directly use the page + * passed in. + */ + lock_page(mmap_page); + + if (mmap_page->mapping != mapping) { + unlock_page(mmap_page); + /* + * Sanity check - the locking in + * ocfs2_pagemkwrite() should ensure + * that this code doesn't trigger. + */ + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + page_cache_get(mmap_page); + wc->w_pages[i] = mmap_page; + } else { + wc->w_pages[i] = find_or_create_page(mapping, index, + GFP_NOFS); + if (!wc->w_pages[i]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } } + + if (index == target_index) + wc->w_target_page = wc->w_pages[i]; } +out: + return ret; +} + +/* + * Prepare a single cluster for write one cluster into the file. + */ +static int ocfs2_write_cluster(struct address_space *mapping, + u32 phys, unsigned int unwritten, + unsigned int should_zero, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, u32 cpos, + loff_t user_pos, unsigned user_len) +{ + int ret, i, new; + u64 v_blkno, p_blkno; + struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; + new = phys == 0 ? 1 : 0; if (new) { + u32 tmp_pos; + /* * This is safe to call with the page locks - it won't take * any additional semaphores or cluster locks. */ - tmp_pos = wc->w_cpos; - ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, di_bh, handle, - data_ac, meta_ac, NULL); + tmp_pos = cpos; + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1114,159 +1258,697 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, mlog_errno(ret); goto out; } + } else if (unwritten) { + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, + wc->w_handle, cpos, 1, phys, + meta_ac, &wc->w_dealloc); + if (ret < 0) { + mlog_errno(ret); + goto out; + } } + if (should_zero) + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); + else + v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; + + /* + * The only reason this should fail is due to an inability to + * find the extent added. + */ ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, NULL); if (ret < 0) { - - /* - * XXX: Should we go readonly here? - */ - - mlog_errno(ret); + ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " + "at logical block %llu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_blkno); goto out; } BUG_ON(p_blkno == 0); - for(i = 0; i < numpages; i++) { - ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], - wc, new); - if (ret < 0) { + for(i = 0; i < wc->w_num_pages; i++) { + int tmpret; + + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, + wc->w_pages[i], cpos, + user_pos, user_len, + should_zero); + if (tmpret) { + mlog_errno(tmpret); + if (ret == 0) + ret = tmpret; + } + } + + /* + * We only have cleanup to do in case of allocating write. + */ + if (ret && new) + ocfs2_write_failure(inode, wc, user_pos, user_len); + +out: + + return ret; +} + +static int ocfs2_write_cluster_by_desc(struct address_space *mapping, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len) +{ + int ret, i; + loff_t cluster_off; + unsigned int local_len = len; + struct ocfs2_write_cluster_desc *desc; + struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb); + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + + /* + * We have to make sure that the total write passed in + * doesn't extend past a single cluster. + */ + local_len = len; + cluster_off = pos & (osb->s_clustersize - 1); + if ((cluster_off + local_len) > osb->s_clustersize) + local_len = osb->s_clustersize - cluster_off; + + ret = ocfs2_write_cluster(mapping, desc->c_phys, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, + wc, desc->c_cpos, pos, local_len); + if (ret) { mlog_errno(ret); goto out; } - copied += ret; + len -= local_len; + pos += local_len; } + ret = 0; out: - for(i = 0; i < numpages; i++) { - unlock_page(cpages[i]); - mark_page_accessed(cpages[i]); - page_cache_release(cpages[i]); - } - kfree(cpages); - - return copied ? copied : ret; + return ret; } -static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, - struct ocfs2_super *osb, loff_t pos, - size_t count, ocfs2_page_writer *cb, - void *cb_priv) +/* + * ocfs2_write_end() wants to know which parts of the target page it + * should complete the write on. It's easiest to compute them ahead of + * time when a more complete view of the write is available. + */ +static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len, int alloc) { - wc->w_count = count; - wc->w_pos = pos; - wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; - wc->w_finished_copy = 0; + struct ocfs2_write_cluster_desc *desc; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) - wc->w_large_pages = 1; - else - wc->w_large_pages = 0; + wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_to = wc->w_target_from + len; + + if (alloc == 0) + return; + + /* + * Allocating write - we may have different boundaries based + * on page size and cluster size. + * + * NOTE: We can no longer compute one value from the other as + * the actual write length and user provided length may be + * different. + */ - wc->w_write_data_page = cb; - wc->w_private = cb_priv; + if (wc->w_large_pages) { + /* + * We only care about the 1st and last cluster within + * our range and whether they should be zero'd or not. Either + * value may be extended out to the start/end of a + * newly allocated cluster. + */ + desc = &wc->w_desc[0]; + if (desc->c_needs_zero) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + &wc->w_target_from, + NULL); + + desc = &wc->w_desc[wc->w_clen - 1]; + if (desc->c_needs_zero) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + NULL, + &wc->w_target_to); + } else { + wc->w_target_from = 0; + wc->w_target_to = PAGE_CACHE_SIZE; + } } /* - * Write a cluster to an inode. The cluster may not be allocated yet, - * in which case it will be. This only exists for buffered writes - - * O_DIRECT takes a more "traditional" path through the kernel. + * Populate each single-cluster write descriptor in the write context + * with information about the i/o to be done. * - * The caller is responsible for incrementing pos, written counts, etc - * - * For file systems that don't support sparse files, pre-allocation - * and page zeroing up until cpos should be done prior to this - * function call. - * - * Callers should be holding i_sem, and the rw cluster lock. - * - * Returns the number of user bytes written, or less than zero for - * error. + * Returns the number of clusters that will have to be allocated, as + * well as a worst case estimate of the number of extent records that + * would have to be created during a write to an unwritten region. */ -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, - size_t count, ocfs2_page_writer *actor, - void *priv) +static int ocfs2_populate_write_desc(struct inode *inode, + struct ocfs2_write_ctxt *wc, + unsigned int *clusters_to_alloc, + unsigned int *extents_to_split) { - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; - ssize_t written = 0; - u32 phys; - struct inode *inode = file->f_mapping->host; + int ret; + struct ocfs2_write_cluster_desc *desc; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + u32 phys = 0; + int i; + + *clusters_to_alloc = 0; + *extents_to_split = 0; + + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + desc->c_cpos = wc->w_cpos + i; + + if (num_clusters == 0) { + /* + * Need to look up the next extent record. + */ + ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Assume worst case - that we're writing in + * the middle of the extent. + * + * We can assume that the write proceeds from + * left to right, in which case the extent + * insert code is smart enough to coalesce the + * next splits into the previous records created. + */ + if (ext_flags & OCFS2_EXT_UNWRITTEN) + *extents_to_split = *extents_to_split + 2; + } else if (phys) { + /* + * Only increment phys if it doesn't describe + * a hole. + */ + phys++; + } + + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + + desc->c_phys = phys; + if (phys == 0) { + desc->c_new = 1; + desc->c_needs_zero = 1; + *clusters_to_alloc = *clusters_to_alloc + 1; + } + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { + desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } + + num_clusters--; + } + + ret = 0; +out: + return ret; +} + +static int ocfs2_write_begin_inline(struct address_space *mapping, + struct inode *inode, + struct ocfs2_write_ctxt *wc) +{ + int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di; - struct ocfs2_alloc_context *data_ac = NULL; - struct ocfs2_alloc_context *meta_ac = NULL; + struct page *page; handle_t *handle; - struct ocfs2_write_ctxt wc; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + + page = find_or_create_page(mapping, 0, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + /* + * If we don't set w_num_pages then this page won't get unlocked + * and freed on cleanup of the write context. + */ + wc->w_pages[0] = wc->w_target_page = page; + wc->w_num_pages = 1; - ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } - ret = ocfs2_meta_lock(inode, &di_bh, 1); + ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { + ocfs2_commit_trans(osb, handle); + mlog_errno(ret); goto out; } - di = (struct ocfs2_dinode *)di_bh->b_data; + + if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + ocfs2_set_inode_data_inline(inode, di); + + if (!PageUptodate(page)) { + ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); + if (ret) { + ocfs2_commit_trans(osb, handle); + + goto out; + } + } + + wc->w_handle = handle; +out: + return ret; +} + +int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + if (new_size <= le16_to_cpu(di->id2.i_data.id_count)) + return 1; + return 0; +} + +static int ocfs2_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, loff_t pos, + unsigned len, struct page *mmap_page, + struct ocfs2_write_ctxt *wc) +{ + int ret, written = 0; + loff_t end = pos + len; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; + + mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", + (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, + oi->ip_dyn_features); /* - * Take alloc sem here to prevent concurrent lookups. That way - * the mapping, zeroing and tree manipulation within - * ocfs2_write() will be safe against ->readpage(). This - * should also serve to lock out allocation from a shared - * writeable region. + * Handle inodes which already have inline data 1st. */ - down_write(&OCFS2_I(inode)->ip_alloc_sem); + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (mmap_page == NULL && + ocfs2_size_fits_inline_data(wc->w_di_bh, end)) + goto do_inline_write; - ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); + /* + * The write won't fit - we have to give this inode an + * inline extent list now. + */ + ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * Check whether the inode can accept inline data. + */ + if (oi->ip_clusters != 0 || i_size_read(inode) != 0) + return 0; + + /* + * Check whether the write can fit. + */ + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) + return 0; + +do_inline_write: + ret = ocfs2_write_begin_inline(mapping, inode, wc); if (ret) { mlog_errno(ret); - goto out_meta; + goto out; } - /* phys == 0 means that allocation is required. */ - if (phys == 0) { - ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); - if (ret) { + /* + * This signals to the caller that the data can be written + * inline. + */ + written = 1; +out: + return written ? written : ret; +} + +/* + * This function only does anything for file systems which can't + * handle sparse files. + * + * What we want to do here is fill in any hole between the current end + * of allocation and the end of our write. That way the rest of the + * write path can treat it as an non-allocating write, which has no + * special case code for sparse/nonsparse files. + */ +static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, + unsigned len, + struct ocfs2_write_ctxt *wc) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + loff_t newsize = pos + len; + + if (ocfs2_sparse_alloc(osb)) + return 0; + + if (newsize <= i_size_read(inode)) + return 0; + + ret = ocfs2_extend_no_holes(inode, newsize, pos); + if (ret) + mlog_errno(ret); + + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + + return ret; +} + +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page) +{ + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int clusters_to_alloc, extents_to_split; + struct ocfs2_write_ctxt *wc; + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle; + struct ocfs2_extent_tree et; + + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (ocfs2_supports_inline_data(osb)) { + ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, + mmap_page, wc); + if (ret == 1) { + ret = 0; + goto success; + } + if (ret < 0) { mlog_errno(ret); - goto out_meta; + goto out; } + } - credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); + ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); + if (ret) { + mlog_errno(ret); + goto out; } - ret = ocfs2_data_lock(inode, 1); + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, + &extents_to_split); if (ret) { mlog_errno(ret); - goto out_meta; + goto out; } + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + + /* + * We set w_target_from, w_target_to here so that + * ocfs2_write_end() knows which range in the target page to + * write out. An allocation requires that we write the entire + * cluster range. + */ + if (clusters_to_alloc || extents_to_split) { + /* + * XXX: We are stretching the limits of + * ocfs2_lock_allocators(). It greatly over-estimates + * the work to be done. + */ + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u," + " clusters_to_add = %u, extents_to_split = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), + clusters_to_alloc, extents_to_split); + + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list, + clusters_to_alloc); + + } + + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_data; + goto out; } - written = ocfs2_write(file, phys, handle, di_bh, data_ac, - meta_ac, &wc); - if (written < 0) { - ret = written; - mlog_errno(ret); + wc->w_handle = handle; + + if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { + ret = -EDQUOT; goto out_commit; } + /* + * We don't want this to fail in ocfs2_write_end(), so do it + * here. + */ + ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_quota; + } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + /* + * Fill our page array first. That way we've grabbed enough so + * that we can zero and flush if we error after adding the + * extent. + */ + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, + cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; + } + + ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, + len); + if (ret) { + mlog_errno(ret); + goto out_quota; + } + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + +success: + *pagep = wc->w_target_page; + *fsdata = wc; + return 0; +out_quota: + if (clusters_to_alloc) + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_write_ctxt(wc); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct inode *inode = mapping->host; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * Take alloc sem here to prevent concurrent lookups. That way + * the mapping, zeroing and tree manipulation within + * ocfs2_write() will be safe against ->readpage(). This + * should also serve to lock out allocation from a shared + * writeable region. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + fsdata, di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto out_fail; + } + + brelse(di_bh); + + return 0; + +out_fail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); + + return ret; +} + +static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, + unsigned len, unsigned *copied, + struct ocfs2_dinode *di, + struct ocfs2_write_ctxt *wc) +{ + void *kaddr; + + if (unlikely(*copied < len)) { + if (!PageUptodate(wc->w_target_page)) { + *copied = 0; + return; + } } - pos += written; + kaddr = kmap_atomic(wc->w_target_page, KM_USER0); + memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); + kunmap_atomic(kaddr, KM_USER0); + + mlog(0, "Data written to inode at offset %llu. " + "id_count = %u, copied = %u, i_dyn_features = 0x%x\n", + (unsigned long long)pos, *copied, + le16_to_cpu(di->id2.i_data.id_count), + le16_to_cpu(di->i_dyn_features)); +} + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i; + unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_write_ctxt *wc = fsdata; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + handle_t *handle = wc->w_handle; + struct page *tmppage; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); + goto out_write_size; + } + + if (unlikely(copied < len)) { + if (!PageUptodate(wc->w_target_page)) + copied = 0; + + ocfs2_zero_new_buffers(wc->w_target_page, start+copied, + start+len); + } + flush_dcache_page(wc->w_target_page); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (tmppage == wc->w_target_page) { + from = wc->w_target_from; + to = wc->w_target_to; + + BUG_ON(from > PAGE_CACHE_SIZE || + to > PAGE_CACHE_SIZE || + to < from); + } else { + /* + * Pages adjacent to the target (if any) imply + * a hole-filling write in which case we want + * to flush their entire range. + */ + from = 0; + to = PAGE_CACHE_SIZE; + } + + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + block_commit_write(tmppage, from, to); + } + } + +out_write_size: + pos += copied; if (pos > inode->i_size) { i_size_write(inode, pos); mark_inode_dirty(inode); @@ -1276,38 +1958,43 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, inode->i_mtime = inode->i_ctime = CURRENT_TIME; di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_journal_dirty(handle, wc->w_di_bh); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) - mlog_errno(ret); - -out_commit: ocfs2_commit_trans(osb, handle); -out_data: - ocfs2_data_unlock(inode, 1); + ocfs2_run_deallocs(osb, &wc->w_dealloc); -out_meta: - up_write(&OCFS2_I(inode)->ip_alloc_sem); - ocfs2_meta_unlock(inode, 1); + ocfs2_free_write_ctxt(wc); -out: - brelse(di_bh); - if (data_ac) - ocfs2_free_alloc_context(data_ac); - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); + return copied; +} - return written ? written : ret; +static int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + struct inode *inode = mapping->host; + + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + + return ret; } const struct address_space_operations ocfs2_aops = { - .readpage = ocfs2_readpage, - .writepage = ocfs2_writepage, - .bmap = ocfs2_bmap, - .sync_page = block_sync_page, - .direct_IO = ocfs2_direct_IO, - .invalidatepage = ocfs2_invalidatepage, - .releasepage = ocfs2_releasepage, - .migratepage = buffer_migrate_page, + .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, + .writepage = ocfs2_writepage, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, + .bmap = ocfs2_bmap, + .sync_page = block_sync_page, + .direct_IO = ocfs2_direct_IO, + .invalidatepage = ocfs2_invalidatepage, + .releasepage = ocfs2_releasepage, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, };