X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fxfs%2Flinux-2.6%2Fxfs_aops.c;h=303a779406c04cc9816185be6439140709fdd219;hb=8699bb0a480193e62d5ccb9c86e2c26b407090a8;hp=6f4c29e9c3d9e6209e5c999df4e61b10ba2dd902;hpb=782e3b3b3804c38d5130c7f21d7ec7bf6709023f;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 6f4c29e..303a779 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -37,11 +37,49 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_iomap.h" +#include "xfs_vnodeops.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include #include #include #include + +/* + * Prime number of hash buckets since address is used as the key. + */ +#define NVSYNC 37 +#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) +static wait_queue_head_t xfs_ioend_wq[NVSYNC]; + +void __init +xfs_ioend_init(void) +{ + int i; + + for (i = 0; i < NVSYNC; i++) + init_waitqueue_head(&xfs_ioend_wq[i]); +} + +void +xfs_ioend_wait( + xfs_inode_t *ip) +{ + wait_queue_head_t *wq = to_ioend_wq(ip); + + wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); +} + STATIC void +xfs_ioend_wake( + xfs_inode_t *ip) +{ + if (atomic_dec_and_test(&ip->i_iocount)) + wake_up(to_ioend_wq(ip)); +} + +void xfs_count_page_state( struct page *page, int *delalloc, @@ -63,64 +101,17 @@ xfs_count_page_state( } while ((bh = bh->b_this_page) != head); } -#if defined(XFS_RW_TRACE) -void -xfs_page_trace( - int tag, - struct inode *inode, - struct page *page, - unsigned long pgoff) +STATIC struct block_device * +xfs_find_bdev_for_inode( + struct inode *inode) { - xfs_inode_t *ip; - bhv_vnode_t *vp = vn_from_inode(inode); - loff_t isize = i_size_read(inode); - loff_t offset = page_offset(page); - int delalloc = -1, unmapped = -1, unwritten = -1; - - if (page_has_buffers(page)) - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - - ip = xfs_vtoi(vp); - if (!ip->i_rwtrace) - return; - - ktrace_enter(ip->i_rwtrace, - (void *)((unsigned long)tag), - (void *)ip, - (void *)inode, - (void *)page, - (void *)pgoff, - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)((unsigned long)((isize >> 32) & 0xffffffff)), - (void *)((unsigned long)(isize & 0xffffffff)), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)delalloc), - (void *)((unsigned long)unmapped), - (void *)((unsigned long)unwritten), - (void *)((unsigned long)current_pid()), - (void *)NULL); -} -#else -#define xfs_page_trace(tag, inode, page, pgoff) -#endif + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; -/* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. 
- */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - queue_work(xfsdatad_workqueue, &ioend->io_work); - if (wait) - flush_workqueue(xfsdatad_workqueue); - } + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_bdev; + else + return mp->m_ddev_targp->bt_bdev; } /* @@ -134,121 +125,149 @@ xfs_destroy_ioend( xfs_ioend_t *ioend) { struct buffer_head *bh, *next; + struct xfs_inode *ip = XFS_I(ioend->io_inode); for (bh = ioend->io_buffer_head; bh; bh = next) { next = bh->b_private; bh->b_end_io(bh, !ioend->io_error); } - if (unlikely(ioend->io_error)) - vn_ioerror(ioend->io_vnode, ioend->io_error, __FILE__,__LINE__); - vn_iowake(ioend->io_vnode); + + /* + * Volume managers supporting multiple paths can send back ENODEV + * when the final path disappears. In this case continuing to fill + * the page cache with dirty data which cannot be written out is + * evil, so prevent that. + */ + if (unlikely(ioend->io_error == -ENODEV)) { + xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, + __FILE__, __LINE__); + } + + xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } /* - * Update on-disk file size now that data has been written to disk. - * The current in-memory file size is i_size. If a write is beyond - * eof io_new_size will be the intended file size until i_size is - * updated. If this write does not extend all the way to the valid - * file size then restrict this update to the end of the write. + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. */ -STATIC void -xfs_setfilesize( +STATIC xfs_fsize_t +xfs_ioend_new_eof( xfs_ioend_t *ioend) { - xfs_inode_t *ip; + xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; xfs_fsize_t bsize; - ip = xfs_vtoi(ioend->io_vnode); - if (!ip) - return; + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + +/* + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. + * + * This function does not block as blocking on the inode lock in IO completion + * can lead to IO completion order dependency deadlocks.. If it can't get the + * inode ilock it will return EAGAIN. Callers must handle this. + */ +STATIC int +xfs_setfilesize( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(ioend->io_type != IOMAP_READ); if (unlikely(ioend->io_error)) - return; - - bsize = ioend->io_offset + ioend->io_size; - - xfs_ilock(ip, XFS_ILOCK_EXCL); + return 0; - isize = MAX(ip->i_size, ip->i_iocore.io_new_size); - isize = MIN(isize, bsize); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return EAGAIN; - if (ip->i_d.di_size < isize) { + isize = xfs_ioend_new_eof(ioend); + if (isize) { ip->i_d.di_size = isize; - ip->i_update_core = 1; - ip->i_update_size = 1; - mark_inode_dirty_sync(vn_to_inode(ioend->io_vnode)); + xfs_mark_inode_dirty(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; } /* - * Buffered IO write completion for delayed allocate extents. + * Schedule IO completion handling on a xfsdatad if this was + * the final hold on this ioend. 
If we are asked to wait, + * flush the workqueue. */ STATIC void -xfs_end_bio_delalloc( - struct work_struct *work) +xfs_finish_ioend( + xfs_ioend_t *ioend, + int wait) { - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct workqueue_struct *wq; - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); + wq = (ioend->io_type == IOMAP_UNWRITTEN) ? + xfsconvertd_workqueue : xfsdatad_workqueue; + queue_work(wq, &ioend->io_work); + if (wait) + flush_workqueue(wq); + } } /* - * Buffered IO write completion for regular, written extents. + * IO write completion. */ STATIC void -xfs_end_bio_written( - struct work_struct *work) +xfs_end_io( + struct work_struct *work) { - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} + /* + * For unwritten extents we need to issue transactions to convert a + * range to normal written extens after the data I/O has finished. + */ + if (ioend->io_type == IOMAP_UNWRITTEN && + likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { -/* - * IO write completion for unwritten extents. - * - * Issue transactions to convert a buffer range from unwritten - * to written extents. - */ -STATIC void -xfs_end_bio_unwritten( - struct work_struct *work) -{ - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - bhv_vnode_t *vp = ioend->io_vnode; - xfs_off_t offset = ioend->io_offset; - size_t size = ioend->io_size; - - if (likely(!ioend->io_error)) { - bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL); - xfs_setfilesize(ioend); + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + if (error) + ioend->io_error = error; } - xfs_destroy_ioend(ioend); -} -/* - * IO read completion for regular, written extents. - */ -STATIC void -xfs_end_bio_read( - struct work_struct *work) -{ - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); + /* + * We might have to update the on-disk file size after extending + * writes. + */ + if (ioend->io_type != IOMAP_READ) { + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); + } - xfs_destroy_ioend(ioend); + /* + * If we didn't complete processing of the ioend, requeue it to the + * tail of the workqueue for another attempt later. Otherwise destroy + * it. 
+ */ + if (error == EAGAIN) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend, 0); + /* ensure we don't spin on blocked ioends */ + delay(1); + } else + xfs_destroy_ioend(ioend); } /* @@ -275,22 +294,14 @@ xfs_alloc_ioend( ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; - ioend->io_vnode = vn_from_inode(inode); + ioend->io_inode = inode; ioend->io_buffer_head = NULL; ioend->io_buffer_tail = NULL; - atomic_inc(&ioend->io_vnode->v_iocount); + atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; - if (type == IOMAP_UNWRITTEN) - INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten); - else if (type == IOMAP_DELAY) - INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc); - else if (type == IOMAP_READ) - INIT_WORK(&ioend->io_work, xfs_end_bio_read); - else - INIT_WORK(&ioend->io_work, xfs_end_bio_written); - + INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; } @@ -302,22 +313,23 @@ xfs_map_blocks( xfs_iomap_t *mapp, int flags) { - bhv_vnode_t *vp = vn_from_inode(inode); - int error, nmaps = 1; + int nmaps = 1; - error = bhv_vop_bmap(vp, offset, count, flags, mapp, &nmaps); - if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE))) - VMODIFY(vp); - return -error; + return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); } -STATIC_INLINE int +STATIC int xfs_iomap_valid( + struct inode *inode, xfs_iomap_t *iomapp, loff_t offset) { - return offset >= iomapp->iomap_offset && - offset < iomapp->iomap_offset + iomapp->iomap_bsize; + struct xfs_mount *mp = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(mp, iomapp->iomap_offset); + xfs_off_t iomap_bsize = XFS_FSB_TO_B(mp, iomapp->iomap_bsize); + + return offset >= iomap_offset && + offset < iomap_offset + iomap_bsize; } /* @@ -343,15 +355,23 @@ xfs_end_bio( STATIC void xfs_submit_ioend_bio( - xfs_ioend_t *ioend, - struct bio *bio) + struct writeback_control *wbc, + xfs_ioend_t *ioend, + struct bio *bio) { atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; - submit_bio(WRITE, bio); + /* + * If the I/O is beyond EOF we mark the inode dirty immediately + * but don't update the inode size until I/O completion. + */ + if (xfs_ioend_new_eof(ioend)) + xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); + + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
+ WRITE_SYNC_PLUG : WRITE, bio); ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); bio_put(bio); } @@ -392,7 +412,6 @@ xfs_start_buffer_writeback( STATIC void xfs_start_page_writeback( struct page *page, - struct writeback_control *wbc, int clear_dirty, int buffers) { @@ -402,10 +421,9 @@ xfs_start_page_writeback( clear_page_dirty_for_io(page); set_page_writeback(page); unlock_page(page); - if (!buffers) { + /* If no buffers on the page are to be written, finish it here */ + if (!buffers) end_page_writeback(page); - wbc->pages_skipped++; /* We didn't write this page */ - } } static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) @@ -432,6 +450,7 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) */ STATIC void xfs_submit_ioend( + struct writeback_control *wbc, xfs_ioend_t *ioend) { xfs_ioend_t *head = ioend; @@ -460,19 +479,19 @@ xfs_submit_ioend( retry: bio = xfs_alloc_ioend_bio(bh); } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); goto retry; } if (bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); goto retry; } lastblock = bh->b_blocknr; } if (bio) - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); xfs_finish_ioend(ioend, 0); } while ((ioend = next) != NULL); } @@ -498,7 +517,7 @@ xfs_cancel_ioend( unlock_buffer(bh); } while ((bh = next_bh) != NULL); - vn_iowake(ioend->io_vnode); + xfs_ioend_wake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } while ((ioend = next) != NULL); } @@ -541,19 +560,21 @@ xfs_add_to_ioend( STATIC void xfs_map_buffer( + struct inode *inode, struct buffer_head *bh, xfs_iomap_t *mp, - xfs_off_t offset, - uint block_bits) + xfs_off_t offset) { sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, mp->iomap_offset); ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); - bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + - ((offset - mp->iomap_offset) >> block_bits); + bn = (mp->iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); - ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); bh->b_blocknr = bn; set_buffer_mapped(bh); @@ -561,17 +582,17 @@ xfs_map_buffer( STATIC void xfs_map_at_offset( + struct inode *inode, struct buffer_head *bh, - loff_t offset, - int block_bits, - xfs_iomap_t *iomapp) + xfs_iomap_t *iomapp, + xfs_off_t offset) { ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); lock_buffer(bh); - xfs_map_buffer(bh, iomapp, offset, block_bits); - bh->b_bdev = iomapp->iomap_target->bt_bdev; + xfs_map_buffer(inode, bh, iomapp, offset); + bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); @@ -660,7 +681,7 @@ xfs_probe_cluster( } else pg_offset = PAGE_CACHE_SIZE; - if (page->index == tindex && !TestSetPageLocked(page)) { + if (page->index == tindex && trylock_page(page)) { pg_len = xfs_probe_page(page, pg_offset, mapped); unlock_page(page); } @@ -737,14 +758,13 @@ xfs_convert_page( xfs_off_t end_offset; unsigned long p_offset; unsigned int type; - int bbits = inode->i_blkbits; int len, page_dirty; int count = 0, done = 0, uptodate = 1; xfs_off_t offset = page_offset(page); if (page->index != tindex) goto fail; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto fail; if (PageWriteback(page)) goto 
fail_unlock_page; @@ -793,7 +813,7 @@ xfs_convert_page( else type = IOMAP_DELAY; - if (!xfs_iomap_valid(mp, offset)) { + if (!xfs_iomap_valid(inode, mp, offset)) { done = 1; continue; } @@ -801,7 +821,7 @@ xfs_convert_page( ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); - xfs_map_at_offset(bh, offset, bbits, mp); + xfs_map_at_offset(inode, bh, mp, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -831,18 +851,11 @@ xfs_convert_page( if (startio) { if (count) { - struct backing_dev_info *bdi; - - bdi = inode->i_mapping->backing_dev_info; wbc->nr_to_write--; - if (bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - } else if (wbc->nr_to_write <= 0) { + if (wbc->nr_to_write <= 0) done = 1; - } } - xfs_start_page_writeback(page, wbc, !page_dirty, count); + xfs_start_page_writeback(page, !page_dirty, count); } return done; @@ -889,6 +902,125 @@ xfs_cluster_write( } } +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned long offset) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset); + block_invalidatepage(page, offset); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. + */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + ssize_t len = 1 << inode->i_blkbits; + + if (!xfs_is_delayed_page(page, IOMAP_DELAY)) + goto out_invalidate; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_invalidate; + + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int done; + xfs_fileoff_t offset_fsb; + xfs_bmbt_irec_t imap; + int nimaps = 1; + int error; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + if (!buffer_delay(bh)) + goto next_buffer; + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... 
+ */ + error = xfs_bmapi(NULL, ip, offset_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard failed delalloc mapping lookup."); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_buffer; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_buffer; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, + &flist, NULL, &done); + + ASSERT(!flist.xbf_count && !flist.xbf_first); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard unable to remove delalloc mapping."); + } + break; + } +next_buffer: + offset += len; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0); + return; +} + /* * Calling this without startio set means we are being asked to make a dirty * page ready for freeing it's buffers. When called with startio set then @@ -991,7 +1123,7 @@ xfs_page_state_convert( } if (iomap_valid) - iomap_valid = xfs_iomap_valid(&iomap, offset); + iomap_valid = xfs_iomap_valid(inode, &iomap, offset); /* * First case, map an unwritten extent and prepare for @@ -1046,11 +1178,10 @@ xfs_page_state_convert( &iomap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + iomap_valid = xfs_iomap_valid(inode, &iomap, offset); } if (iomap_valid) { - xfs_map_at_offset(bh, offset, - inode->i_blkbits, &iomap); + xfs_map_at_offset(inode, bh, &iomap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, &ioend, @@ -1077,7 +1208,7 @@ xfs_page_state_convert( &iomap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + iomap_valid = xfs_iomap_valid(inode, &iomap, offset); } /* @@ -1089,7 +1220,7 @@ xfs_page_state_convert( * that we are writing into for the first time. 
*/ type = IOMAP_NEW; - if (!test_and_set_bit(BH_Lock, &bh->b_state)) { + if (trylock_buffer(bh)) { ASSERT(buffer_mapped(bh)); if (iomap_valid) all_bh = 1; @@ -1114,10 +1245,14 @@ xfs_page_state_convert( SetPageUptodate(page); if (startio) - xfs_start_page_writeback(page, wbc, 1, count); + xfs_start_page_writeback(page, 1, count); if (ioend && iomap_valid) { - offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, iomap.iomap_offset); + xfs_off_t iomap_bsize = XFS_FSB_TO_B(m, iomap.iomap_bsize); + + offset = (iomap_offset + iomap_bsize - 1) >> PAGE_CACHE_SHIFT; tlast = min_t(pgoff_t, offset, last_index); xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, @@ -1125,7 +1260,7 @@ xfs_page_state_convert( } if (iohead) - xfs_submit_ioend(iohead); + xfs_submit_ioend(wbc, iohead); return page_dirty; @@ -1140,7 +1275,7 @@ error: */ if (err != -EAGAIN) { if (!unmapped) - block_invalidatepage(page, 0); + xfs_aops_discard_page(page); ClearPageUptodate(page); } return err; @@ -1176,7 +1311,7 @@ xfs_vm_writepage( int delalloc, unmapped, unwritten; struct inode *inode = page->mapping->host; - xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0); + trace_xfs_writepage(inode, page, 0); /* * We need a transaction if: @@ -1212,6 +1347,14 @@ xfs_vm_writepage( if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); + + /* + * VM calculation for nr_to_write seems off. Bump it way + * up, this gets simple streaming writes zippy again. + * To be reviewed again after Jens' writeback changes. + */ + wbc->nr_to_write *= 4; + /* * Convert delayed allocate, unwritten or unmapped space * to real space and flush out to disk. @@ -1238,10 +1381,7 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct bhv_vnode *vp = vn_from_inode(mapping->host); - - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); return generic_writepages(mapping, wbc); } @@ -1276,7 +1416,7 @@ xfs_vm_releasepage( .nr_to_write = 1, }; - xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0); + trace_xfs_releasepage(inode, page, 0); if (!page_has_buffers(page)) return 0; @@ -1318,7 +1458,6 @@ __xfs_get_blocks( int direct, bmapi_flags_t flags) { - bhv_vnode_t *vp = vn_from_inode(inode); xfs_iomap_t iomap; xfs_off_t offset; ssize_t size; @@ -1328,7 +1467,11 @@ __xfs_get_blocks( offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - error = bhv_vop_bmap(vp, offset, size, + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + + error = xfs_iomap(XFS_I(inode), offset, size, create ? flags : BMAPI_READ, &iomap, &niomap); if (error) return -error; @@ -1340,10 +1483,8 @@ __xfs_get_blocks( * For unwritten extents do not report a disk address on * the read case (treat as if we're reading into a hole). */ - if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { - xfs_map_buffer(bh_result, &iomap, offset, - inode->i_blkbits); - } + if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) + xfs_map_buffer(inode, bh_result, &iomap, offset); if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { if (direct) bh_result->b_private = inode; @@ -1355,7 +1496,7 @@ __xfs_get_blocks( * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. 
*/ - bh_result->b_bdev = iomap.iomap_target->bt_bdev; + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); /* * If we previously allocated a block out beyond eof and we are now @@ -1382,9 +1523,14 @@ __xfs_get_blocks( } if (direct || size > (1 << inode->i_blkbits)) { - ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); + struct xfs_mount *mp = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(mp, iomap.iomap_offset); + xfs_off_t iomap_delta = offset - iomap_offset; + xfs_off_t iomap_bsize = XFS_FSB_TO_B(mp, iomap.iomap_bsize); + + ASSERT(iomap_bsize - iomap_delta > 0); offset = min_t(xfs_off_t, - iomap.iomap_bsize - iomap.iomap_delta, size); + iomap_bsize - iomap_delta, size); bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); } @@ -1454,7 +1600,7 @@ xfs_end_io_direct( * didn't map an unwritten extent so switch it's completion * handler. */ - INIT_WORK(&ioend->io_work, xfs_end_bio_written); + ioend->io_type = IOMAP_NEW; xfs_finish_ioend(ioend, 0); } @@ -1476,31 +1622,18 @@ xfs_vm_direct_IO( { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - xfs_iomap_t iomap; - int maps = 1; - int error; + struct block_device *bdev; ssize_t ret; - error = bhv_vop_bmap(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps); - if (error) - return -error; + bdev = xfs_find_bdev_for_inode(inode); - if (rw == WRITE) { - iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); - ret = blockdev_direct_IO_own_locking(rw, iocb, inode, - iomap.iomap_target->bt_bdev, - iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); - } else { - iocb->private = xfs_alloc_ioend(inode, IOMAP_READ); - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, - iomap.iomap_target->bt_bdev, - iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); - } + iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 
+ IOMAP_UNWRITTEN : IOMAP_READ); + + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct); if (unlikely(ret != -EIOCBQUEUED && iocb->private)) xfs_destroy_ioend(iocb->private); @@ -1508,13 +1641,18 @@ xfs_vm_direct_IO( } STATIC int -xfs_vm_prepare_write( +xfs_vm_write_begin( struct file *file, - struct page *page, - unsigned int from, - unsigned int to) + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) { - return block_prepare_write(page, from, to, xfs_get_blocks); + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + xfs_get_blocks); } STATIC sector_t @@ -1523,12 +1661,12 @@ xfs_vm_bmap( sector_t block) { struct inode *inode = (struct inode *)mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); + struct xfs_inode *ip = XFS_I(inode); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - bhv_vop_rwlock(vp, VRWLOCK_READ); - bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF); - bhv_vop_rwunlock(vp, VRWLOCK_READ); + xfs_itrace_entry(XFS_I(inode)); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } @@ -1550,16 +1688,6 @@ xfs_vm_readpages( return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned long offset) -{ - xfs_page_trace(XFS_INVALIDPAGE_ENTER, - page->mapping->host, page, offset); - block_invalidatepage(page, offset); -} - const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, @@ -1568,9 +1696,11 @@ const struct address_space_operations xfs_address_space_operations = { .sync_page = block_sync_page, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .prepare_write = xfs_vm_prepare_write, - .commit_write = generic_commit_write, + .write_begin = xfs_vm_write_begin, + .write_end = generic_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, };
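
Note on the hashed ioend wait queues introduced above: xfs_ioend_wait()/xfs_ioend_wake() do not embed a wait queue in every inode; instead the inode's address is hashed into a small, prime-sized array of wait queues (NVSYNC == 37, via to_ioend_wq()). The following stand-alone sketch is plain userspace C, not kernel code, and only illustrates that address-hashing idea; the bucket struct, to_bucket() helper and the demo main() are illustrative stand-ins, not part of the patch.

/*
 * Minimal illustration of the bucketed wait-queue scheme used by
 * xfs_ioend_wait()/xfs_ioend_wake(): hash the object's address modulo a
 * prime number of buckets so unrelated objects rarely share a slot.
 * Userspace demo only -- the real code hashes into wait_queue_head_t's.
 */
#include <stdio.h>
#include <stdint.h>

#define NVSYNC 37			/* prime bucket count, as in the patch */

struct bucket {
	int waiters;			/* stand-in for a wait_queue_head_t */
};

static struct bucket buckets[NVSYNC];

/* Same idea as to_ioend_wq(): key on the address, not the contents. */
static struct bucket *to_bucket(const void *obj)
{
	return &buckets[(uintptr_t)obj % NVSYNC];
}

int main(void)
{
	int a, b;

	printf("a -> bucket %ld\n", (long)(to_bucket(&a) - buckets));
	printf("b -> bucket %ld\n", (long)(to_bucket(&b) - buckets));
	return 0;
}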