X-Git-Url: http://ftp.safe.ca/?p=safe%2Fjmp%2Flinux-2.6;a=blobdiff_plain;f=mm%2Ffilemap.c;h=c3d3506ecabaf0a725b32d3b69ad699c5095adfa;hp=557fd887254f6cb2b891d0fd11a4c09e79283e3f;hb=af901ca181d92aac3a7dc265144a9081a86d8f39;hpb=08291429cfa6258c4cd95d8833beb40f828b194e diff --git a/mm/filemap.c b/mm/filemap.c index 557fd88..c3d3506 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -25,25 +25,24 @@ #include #include #include +#include #include #include #include #include #include -#include "filemap.h" +#include /* for BUG_ON(!in_atomic()) only */ +#include +#include /* for page_is_file_cache() */ #include "internal.h" /* * FIXME: remove all knowledge of the buffer layer from the core VM */ -#include /* for generic_osync_inode */ +#include /* for try_to_free_buffers */ #include -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); - /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -59,7 +58,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_lock (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock @@ -105,12 +104,16 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) + * + * (code doesn't rely on that order, so you could switch it around) + * ->tasklist_lock (memory_failure, collect_procs_ao) + * ->i_mmap_lock */ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { @@ -120,7 +123,21 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); + + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * + * Fix it up by doing a final dirty accounting check after + * having removed the page entirely. + */ + if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + } } void remove_from_page_cache(struct page *page) @@ -129,9 +146,10 @@ void remove_from_page_cache(struct page *page) BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } static int sync_page(void *word) @@ -170,6 +188,12 @@ static int sync_page(void *word) return 0; } +static int sync_page_killable(void *word) +{ + sync_page(word); + return fatal_signal_pending(current) ? -EINTR : 0; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -191,7 +215,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, int ret; struct writeback_control wbc = { .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, + .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; @@ -215,11 +239,12 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush @@ -287,68 +312,24 @@ int wait_on_page_writeback_range(struct address_space *mapping, } /** - * sync_page_range - write and wait on all pages in the passed range - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Write and wait upon all the pages in the passed range. This is a "data - * integrity" operation. It waits upon in-flight writeout before starting and - * waiting upon new writeout. If there was an IO error, return it. + * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range + * @mapping: address space structure to wait for + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) * - * We need to re-take i_mutex during the generic_osync_inode list walk because - * it is otherwise livelockable. - */ -int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) { - mutex_lock(&inode->i_mutex); - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - mutex_unlock(&inode->i_mutex); - } - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range); - -/** - * sync_page_range_nolock - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. * - * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea - * as it forces O_SYNC writers to different parts of the same file - * to be serialised right until io completion. + * This is just a simple wrapper so that callers don't have to convert offsets + * to page indexes themselves */ -int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) +int filemap_fdatawait_range(struct address_space *mapping, loff_t start, + loff_t end) { - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; + return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); } -EXPORT_SYMBOL(sync_page_range_nolock); +EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait - wait for all under-writeback pages to complete @@ -421,58 +402,89 @@ int filemap_write_and_wait_range(struct address_space *mapping, } return err; } +EXPORT_SYMBOL(filemap_write_and_wait_range); /** - * add_to_page_cache - add newly allocated pagecache pages + * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * + * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */ -int add_to_page_cache(struct page *page, struct address_space *mapping, +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + int error; + VM_BUG_ON(!PageLocked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, + gfp_mask & GFP_RECLAIM_MASK); + if (error) + goto out; + + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { - write_lock_irq(&mapping->tree_lock); + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; + if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); } - write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); - } + } else + mem_cgroup_uncharge_cache_page(page); +out: return error; } -EXPORT_SYMBOL(add_to_page_cache); +EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add(page); + int ret; + + /* + * Splice_read and readahead add shmem/tmpfs pages into the page cache + * before shmem_readpage has a chance to mark them as SwapBacked: they + * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * (called in add_to_page_cache) needs to know where they're going too. + */ + if (mapping_cap_swap_backed(mapping)) + SetPageSwapBacked(page); + + ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) { + if (page_is_file_cache(page)) + lru_cache_add_file(page); + else + lru_cache_add_active_anon(page); + } return ret; } +EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } @@ -507,7 +519,7 @@ static inline void wake_up_page(struct page *page, int bit) __wake_up_bit(page_waitqueue(page), &page->flags, bit); } -void fastcall wait_on_page_bit(struct page *page, int bit_nr) +void wait_on_page_bit(struct page *page, int bit_nr) { DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); @@ -518,6 +530,24 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr) EXPORT_SYMBOL(wait_on_page_bit); /** + * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue + * @page: Page defining the wait queue of interest + * @waiter: Waiter to add to the queue + * + * Add an arbitrary @waiter to the wait queue for the nominated @page. + */ +void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, waiter); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(add_page_wait_queue); + +/** * unlock_page - unlock a locked page * @page: the page * @@ -526,17 +556,14 @@ EXPORT_SYMBOL(wait_on_page_bit); * mechananism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The first mb is necessary to safely close the critical section opened by the - * TestSetPageLocked(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page_locked()). + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). */ -void fastcall unlock_page(struct page *page) +void unlock_page(struct page *page) { - smp_mb__before_clear_bit(); - if (!TestClearPageLocked(page)) - BUG(); - smp_mb__after_clear_bit(); + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -547,10 +574,12 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - if (!test_clear_page_writeback(page)) - BUG(); - } + if (TestClearPageReclaim(page)) + rotate_reclaimable_page(page); + + if (!test_clear_page_writeback(page)) + BUG(); + smp_mb__after_clear_bit(); wake_up_page(page, PG_writeback); } @@ -565,7 +594,7 @@ EXPORT_SYMBOL(end_page_writeback); * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ -void fastcall __lock_page(struct page *page) +void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); @@ -574,11 +603,23 @@ void fastcall __lock_page(struct page *page) } EXPORT_SYMBOL(__lock_page); -/* +int __lock_page_killable(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + return __wait_on_bit_lock(page_waitqueue(page), &wait, + sync_page_killable, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(__lock_page_killable); + +/** + * __lock_page_nosync - get a lock on the page, without calling sync_page() + * @page: the page to lock + * * Variant of lock_page that does not require the caller to hold a reference * on the page's mapping. */ -void fastcall __lock_page_nosync(struct page *page) +void __lock_page_nosync(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, @@ -593,15 +634,35 @@ void fastcall __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { + void **pagep; struct page *page; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page || page == RADIX_TREE_RETRY)) + goto repeat; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_get_page); @@ -616,32 +677,22 @@ EXPORT_SYMBOL(find_get_page); * * Returns zero if the page was not present. find_lock_page() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); + page = find_get_page(mapping, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - VM_BUG_ON(page->index != offset); - goto out; + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } + VM_BUG_ON(page->index != offset); } - read_unlock_irq(&mapping->tree_lock); -out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -674,7 +725,14 @@ repeat: page = __page_cache_alloc(gfp_mask); if (!page) return NULL; - err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + /* + * We want a regular kernel memory (not highmem or DMA etc) + * allocation for the radix tree nodes, but we need to honour + * the context-specific requirements the caller has asked for. + * GFP_RECLAIM_MASK collects those requirements. + */ + err = add_to_page_cache_lru(page, mapping, index, + (gfp_mask & GFP_RECLAIM_MASK)); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -707,13 +765,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + pages[ret] = page; + ret++; + } + rcu_read_unlock(); return ret; } @@ -734,19 +818,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + if (page->mapping == NULL || page->index != index) break; - page_cache_get(pages[i]); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; index++; } - read_unlock_irq(&mapping->tree_lock); - return i; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(find_get_pages_contig); @@ -766,15 +875,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, + (void ***)pages, *index, nr_pages, tag); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); + return ret; } EXPORT_SYMBOL(find_get_pages_tag); @@ -798,13 +935,13 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) struct page *page = find_get_page(mapping, index); if (page) { - if (!TestSetPageLocked(page)) + if (trylock_page(page)) return page; page_cache_release(page); return NULL; } page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { + if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { page_cache_release(page); page = NULL; } @@ -830,16 +967,11 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } /** - * do_generic_mapping_read - generic file read routine - * @mapping: address_space to be read - * @_ra: file's readahead state + * do_generic_file_read - generic file read routine * @filp: the file to read * @ppos: current file position * @desc: read_descriptor @@ -850,18 +982,13 @@ static void shrink_readahead_size_eio(struct file *filp, * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. - * - * Note the struct file* is only passed for the use of readpage. - * It may be NULL. */ -void do_generic_mapping_read(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - loff_t *ppos, - read_descriptor_t *desc, - read_actor_t actor) +static void do_generic_file_read(struct file *filp, loff_t *ppos, + read_descriptor_t *desc, read_actor_t actor) { + struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; + struct file_ra_state *ra = &filp->f_ra; pgoff_t index; pgoff_t last_index; pgoff_t prev_index; @@ -897,8 +1024,17 @@ find_page: ra, filp, page, index, last_index - index); } - if (!PageUptodate(page)) - goto page_not_up_to_date; + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + if (!mapping->a_ops->is_partially_uptodate(page, + desc, offset)) + goto page_not_up_to_date_locked; + unlock_page(page); + } page_ok: /* * i_size must be checked after we know the page is Uptodate. @@ -965,8 +1101,11 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - lock_page(page); + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; +page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); @@ -993,7 +1132,9 @@ readpage: } if (!PageUptodate(page)) { - lock_page(page); + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -1004,8 +1145,8 @@ readpage: goto find_page; } unlock_page(page); - error = -EIO; shrink_readahead_size_eio(filp, ra); + error = -EIO; goto readpage_error; } unlock_page(page); @@ -1047,10 +1188,8 @@ out: ra->prev_pos |= prev_offset; *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; - if (filp) - file_accessed(filp); + file_accessed(filp); } -EXPORT_SYMBOL(do_generic_mapping_read); int file_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset, unsigned long size) @@ -1162,42 +1301,42 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, mapping = filp->f_mapping; inode = mapping->host; - retval = 0; if (!count) goto out; /* skip atime */ size = i_size_read(inode); if (pos < size) { - retval = generic_file_direct_IO(READ, iocb, - iov, pos, nr_segs); + retval = filemap_write_and_wait_range(mapping, pos, + pos + iov_length(iov, nr_segs) - 1); + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); + } if (retval > 0) *ppos = pos + retval; - } - if (likely(retval != 0)) { - file_accessed(filp); - goto out; + if (retval) { + file_accessed(filp); + goto out; + } } } - retval = 0; - if (count) { - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp,ppos,&desc,file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; } + if (desc.count > 0) + break; } out: return retval; @@ -1211,12 +1350,11 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); + force_page_cache_readahead(mapping, filp, index, nr); return 0; } -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; @@ -1235,6 +1373,13 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) } return ret; } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_readahead(long fd, loff_t offset, long count) +{ + return SYSC_readahead((int) fd, offset, (size_t) count); +} +SYSCALL_ALIAS(sys_readahead, SyS_readahead); +#endif #ifdef CONFIG_MMU /** @@ -1245,7 +1390,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int fastcall page_cache_read(struct file * file, pgoff_t offset) +static int page_cache_read(struct file *file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; struct page *page; @@ -1271,6 +1416,73 @@ static int fastcall page_cache_read(struct file * file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + + if (VM_SequentialReadHint(vma) || + offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); + return; + } + + if (ra->mmap_miss < INT_MAX) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + /* + * mmap read-around + */ + ra_pages = max_sane_readahead(ra->ra_pages); + if (ra_pages) { + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); + } +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. + */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1290,78 +1502,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; - unsigned long size; - int did_readaround = 0; + pgoff_t size; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) - goto outside_data_content; - - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; + if (offset >= size) + return VM_FAULT_SIGBUS; /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - + page = find_get_page(mapping, offset); + if (likely(page)) { /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; + do_async_mmap_readahead(vma, ra, file, page, offset); + lock_page(page); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto no_cached_page; } - page = find_lock_page(mapping, vmf->pgoff); + } else { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_lock_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; - /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1369,37 +1547,27 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); - goto outside_data_content; + return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - mark_page_accessed(page); - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; -outside_data_content: - /* - * An external ptracer can access pages that normally aren't - * accessible.. - */ - if (vma->vm_mm == current->mm) - return VM_FAULT_SIGBUS; - - /* Fall through to the non-read-ahead case */ no_cached_page: /* * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1419,12 +1587,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, @@ -1433,6 +1595,11 @@ page_not_uptodate: */ ClearPageError(page); error = mapping->a_ops->readpage(file, page); + if (!error) { + wait_on_page_locked(page); + if (!PageUptodate(page)) + error = -EIO; + } page_cache_release(page); if (!error || error == AOP_TRUNCATED_PAGE) @@ -1444,7 +1611,7 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, }; @@ -1515,9 +1682,20 @@ repeat: return page; } -/* +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. */ struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, @@ -1609,14 +1787,14 @@ int should_remove_suid(struct dentry *dentry) if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; - if (unlikely(kill && !capable(CAP_FSETID))) + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; } EXPORT_SYMBOL(should_remove_suid); -int __remove_suid(struct dentry *dentry, int kill) +static int __remove_suid(struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1624,19 +1802,25 @@ int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs); } -int remove_suid(struct dentry *dentry) +int file_remove_suid(struct file *file) { - int kill = should_remove_suid(dentry); - - if (unlikely(kill)) - return __remove_suid(dentry, kill); + struct dentry *dentry = file->f_path.dentry; + int killsuid = should_remove_suid(dentry); + int killpriv = security_inode_need_killpriv(dentry); + int error = 0; + + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); - return 0; + return error; } -EXPORT_SYMBOL(remove_suid); +EXPORT_SYMBOL(file_remove_suid); -size_t -__filemap_copy_from_user_iovec_inatomic(char *vaddr, +static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { size_t copied = 0, left = 0; @@ -1646,7 +1830,7 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); + left = __copy_from_user_inatomic(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; @@ -1659,6 +1843,125 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, } /* + * Copy as much as we can into the page and return the number of bytes which + * were successfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page, KM_USER0); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr, KM_USER0); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same sideeffects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. + */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user); + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + } +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. @@ -1740,6 +2043,28 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i } EXPORT_SYMBOL(generic_write_checks); +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + mark_page_accessed(page); + return aops->write_end(file, mapping, pos, len, copied, page, fsdata); +} +EXPORT_SYMBOL(pagecache_write_end); + ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, @@ -1749,11 +2074,54 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written; + size_t write_len; + pgoff_t end; if (count != ocount) *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); + write_len = iov_length(iov, *nr_segs); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; + + written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); + if (written) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (mapping->nrpages) { + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; + } + } + + written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + if (written > 0) { loff_t end = pos + written; if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -1762,19 +2130,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } *ppos = end; } - - /* - * Sync the fs metadata but not the minor inode changes and - * of course not the data as we did direct DMA for the IO. - * i_mutex is held, which protects generic_osync_inode() from - * livelocking. AIO O_DIRECT ops attempt to sync metadata here. - */ - if ((written >= 0 || written == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (err < 0) - written = err; - } +out: return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -1783,20 +2139,24 @@ EXPORT_SYMBOL(generic_file_direct_write); * Find or create a page at the given pagecache position. Return the locked * page. This function is specifically for buffered writes. */ -static struct page *__grab_cache_page(struct address_space *mapping, - pgoff_t index) +struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index, unsigned flags) { int status; struct page *page; + gfp_t gfp_notmask = 0; + if (flags & AOP_FLAG_NOFS) + gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); if (likely(page)) return page; - page = page_cache_alloc(mapping); + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); if (!page) return NULL; - status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + status = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL & ~gfp_notmask); if (unlikely(status)) { page_cache_release(page); if (status == -EEXIST) @@ -1805,51 +2165,37 @@ repeat: } return page; } +EXPORT_SYMBOL(grab_cache_page_write_begin); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) { - struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_offset = 0; /* offset in the current iovec */ - char __user *buf; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; /* - * handle partial DIO write. Adjust cur_iov if needed. + * Copies from kernel address space cannot fail (NFSD is a big user). */ - filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, written); + if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; do { - struct page *src_page; struct page *page; pgoff_t index; /* Pagecache index for current page */ unsigned long offset; /* Offset into pagecache page */ - unsigned long seglen; /* Bytes remaining in current iovec */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ + void *fsdata; - buf = cur_iov->iov_base + iov_offset; offset = (pos & (PAGE_CACHE_SIZE - 1)); index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - /* - * a non-NULL src_page indicates that we're doing the - * copy via get_user_pages and kmap. - */ - src_page = NULL; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); - seglen = cur_iov->iov_len - iov_offset; - if (seglen > bytes) - seglen = bytes; +again: /* * Bring in the user page that we will copy from _first_. @@ -1861,149 +2207,70 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * to check that the address is actually valid, when atomic * usercopies are used, below. */ - if (unlikely(fault_in_pages_readable(buf, seglen))) { + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } - page = __grab_cache_page(mapping, index); - if (!page) { - status = -ENOMEM; + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) break; - } - /* - * non-uptodate pages cannot cope with short copies, and we - * cannot take a pagefault with the destination page locked. - * So pin the source page to copy it. - */ - if (!PageUptodate(page)) { - unlock_page(page); - - src_page = alloc_page(GFP_KERNEL); - if (!src_page) { - page_cache_release(page); - status = -ENOMEM; - break; - } - - /* - * Cannot get_user_pages with a page locked for the - * same reason as we can't take a page fault with a - * page locked (as explained below). - */ - copied = filemap_copy_from_user(src_page, offset, - cur_iov, nr_segs, iov_offset, bytes); - if (unlikely(copied == 0)) { - status = -EFAULT; - page_cache_release(page); - page_cache_release(src_page); - break; - } - bytes = copied; - - lock_page(page); - /* - * Can't handle the page going uptodate here, because - * that means we would use non-atomic usercopies, which - * zero out the tail of the page, which can cause - * zeroes to become transiently visible. We could just - * use a non-zeroing copy, but the APIs aren't too - * consistent. - */ - if (unlikely(!page->mapping || PageUptodate(page))) { - unlock_page(page); - page_cache_release(page); - page_cache_release(src_page); - continue; - } + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); - } + mark_page_accessed(page); + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; - status = a_ops->prepare_write(file, page, offset, offset+bytes); - if (unlikely(status)) - goto fs_write_aop_error; + cond_resched(); - if (!src_page) { + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { /* - * Must not enter the pagefault handler here, because - * we hold the page lock, so we might recursively - * deadlock on the same lock, or get an ABBA deadlock - * against a different lock, or against the mmap_sem - * (which nests outside the page lock). So increment - * preempt count, and use _atomic usercopies. + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. * - * The page is uptodate so we are OK to encounter a - * short copy: if unmodified parts of the page are - * marked dirty and written out to disk, it doesn't - * really matter. + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. */ - pagefault_disable(); - copied = filemap_copy_from_user_atomic(page, offset, - cur_iov, nr_segs, iov_offset, bytes); - pagefault_enable(); - } else { - void *src, *dst; - src = kmap_atomic(src_page, KM_USER0); - dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + offset, bytes); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); - copied = bytes; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; } - flush_dcache_page(page); - - status = a_ops->commit_write(file, page, offset, offset+bytes); - if (unlikely(status < 0 || status == AOP_TRUNCATED_PAGE)) - goto fs_write_aop_error; - if (unlikely(status > 0)) /* filesystem did partial write */ - copied = min_t(size_t, copied, status); - - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - if (src_page) - page_cache_release(src_page); - - written += copied; - count -= copied; pos += copied; - filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, copied); + written += copied; balance_dirty_pages_ratelimited(mapping); - cond_resched(); - continue; -fs_write_aop_error: - if (status != AOP_TRUNCATED_PAGE) - unlock_page(page); - page_cache_release(page); - if (src_page) - page_cache_release(src_page); + } while (iov_iter_count(i)); - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + bytes > inode->i_size) - vmtruncate(inode, inode->i_size); - if (status == AOP_TRUNCATED_PAGE) - continue; - else - break; - } while (count); - *ppos = pos; + return written ? written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + status = generic_perform_write(file, &i, pos); - /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC - */ if (likely(status >= 0)) { - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - } + written += status; + *ppos = pos + status; } /* @@ -2012,15 +2279,34 @@ fs_write_aop_error: * the file data here, to try to honour O_DIRECT expectations. */ if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait(mapping); + status = filemap_write_and_wait_range(mapping, + pos, pos + written - 1); return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); -static ssize_t -__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +/** + * __generic_file_aio_write - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @ppos: position where to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_mutex to be grabbed unless we work on a block device or similar + * object which does not need locking at all. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_mutex. + */ +ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; @@ -2052,7 +2338,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; @@ -2117,121 +2403,43 @@ out: current->backing_dev_info = NULL; return written ? written : err; } +EXPORT_SYMBOL(__generic_file_aio_write); -ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_aio_write_nolock); - +/** + * generic_file_aio_write - write data to a file + * @iocb: IO state structure + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @pos: position in file where to write + * + * This is a wrapper around __generic_file_aio_write() to be used by most + * filesystems. It takes care of syncing the file in case of O_SYNC file + * and acquires i_mutex as needed. + */ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; - err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) ret = err; } return ret; } EXPORT_SYMBOL(generic_file_aio_write); -/* - * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something - * went wrong during pagecache shootdown. - */ -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t retval; - size_t write_len; - pgoff_t end = 0; /* silence gcc */ - - /* - * If it's a write, unmap all mmappings of the file up-front. This - * will cause any pte dirty bits to be propagated into the pageframes - * for the subsequent filemap_write_and_wait(). - */ - if (rw == WRITE) { - write_len = iov_length(iov, nr_segs); - end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, offset, write_len, 0); - } - - retval = filemap_write_and_wait(mapping); - if (retval) - goto out; - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - if (rw == WRITE && mapping->nrpages) { - retval = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (retval) - goto out; - } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - if (retval) - goto out; - - /* - * Finally, try again to invalidate clean pages which might have been - * faulted in by get_user_pages() if the source of the write was an - * mmap()ed region of the file we're writing. That's a pretty crazy - * thing to do, so we don't support it 100%. If this invalidation - * fails and we have -EIOCBQUEUED we ignore the failure. - */ - if (rw == WRITE && mapping->nrpages) { - int err = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (err && retval >= 0) - retval = err; - } -out: - return retval; -} - /** * try_to_release_page() - release old fs-specific metadata on a page * @@ -2242,10 +2450,12 @@ out: * (presumably at page->private). If the release was successful, return `1'. * Otherwise return zero. * + * This may also be called if PG_fscache is set on a page, indicating that the + * page is known to the local caching routines. + * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * - * NOTE: @gfp_mask may go away, and this function may become non-blocking. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) {