X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Ffilemap.c;h=7c8643630023d239f099335f4a80480d5bde0a02;hb=3a424f2d56613acfb9e583ec9c85a2be3e3af028;hp=48497094ae836118366d14b1e9cdaf1ec9f08528;hpb=ee0b3e671baff681d69fbf0db33b47603c0a8280;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/filemap.c b/mm/filemap.c index 4849709..7c86436 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -25,12 +25,14 @@ #include #include #include +#include #include #include +#include #include #include #include -#include "filemap.h" +#include /* for BUG_ON(!in_atomic()) only */ #include "internal.h" /* @@ -63,6 +65,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock + * ->zone.lock * * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) @@ -75,8 +78,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->mmap_sem * ->lock_page (access_process_vm) * - * ->mmap_sem - * ->i_mutex (msync) + * ->i_mutex (generic_file_buffered_write) + * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * ->i_mutex * ->i_alloc_sem (various) @@ -120,6 +123,7 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + BUG_ON(page_mapped(page)); } void remove_from_page_cache(struct page *page) @@ -327,7 +331,7 @@ EXPORT_SYMBOL(sync_page_range); * @pos: beginning offset in pages to write * @count: number of bytes to write * - * Note: Holding i_mutex across sync_page_range_nolock is not a good idea + * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ @@ -467,25 +471,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, } #ifdef CONFIG_NUMA -struct page *page_cache_alloc(struct address_space *x) -{ - if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x), 0); - } - return alloc_pages(mapping_gfp_mask(x), 0); -} -EXPORT_SYMBOL(page_cache_alloc); - -struct page *page_cache_alloc_cold(struct address_space *x) +struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); + return alloc_pages_node(n, gfp, 0); } - return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); + return alloc_pages(gfp, 0); } -EXPORT_SYMBOL(page_cache_alloc_cold); +EXPORT_SYMBOL(__page_cache_alloc); #endif static int __sleep_on_page_lock(void *word) @@ -602,7 +596,7 @@ void fastcall __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, unsigned long offset) +struct page * find_get_page(struct address_space *mapping, pgoff_t offset) { struct page *page; @@ -616,26 +610,6 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset) EXPORT_SYMBOL(find_get_page); /** - * find_trylock_page - find and lock a page - * @mapping: the address_space to search - * @offset: the page index - * - * Same as find_get_page(), but trylock it instead of incrementing the count. - */ -struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) -{ - struct page *page; - - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page && TestSetPageLocked(page)) - page = NULL; - read_unlock_irq(&mapping->tree_lock); - return page; -} -EXPORT_SYMBOL(find_trylock_page); - -/** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search * @offset: the page index @@ -646,30 +620,31 @@ EXPORT_SYMBOL(find_trylock_page); * Returns zero if the page was not present. find_lock_page() may sleep. */ struct page *find_lock_page(struct address_space *mapping, - unsigned long offset) + pgoff_t offset) { struct page *page; - read_lock_irq(&mapping->tree_lock); repeat: + read_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); __lock_page(page); - read_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping || - page->index != offset)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); page_cache_release(page); goto repeat; } + VM_BUG_ON(page->index != offset); + goto out; } } read_unlock_irq(&mapping->tree_lock); +out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -692,28 +667,24 @@ EXPORT_SYMBOL(find_lock_page); * memory exhaustion. */ struct page *find_or_create_page(struct address_space *mapping, - unsigned long index, gfp_t gfp_mask) + pgoff_t index, gfp_t gfp_mask) { - struct page *page, *cached_page = NULL; + struct page *page; int err; repeat: page = find_lock_page(mapping, index); if (!page) { - if (!cached_page) { - cached_page = alloc_page(gfp_mask); - if (!cached_page) - return NULL; + page = __page_cache_alloc(gfp_mask); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + if (unlikely(err)) { + page_cache_release(page); + page = NULL; + if (err == -EEXIST) + goto repeat; } - err = add_to_page_cache_lru(cached_page, mapping, - index, gfp_mask); - if (!err) { - page = cached_page; - cached_page = NULL; - } else if (err == -EEXIST) - goto repeat; } - if (cached_page) - page_cache_release(cached_page); return page; } EXPORT_SYMBOL(find_or_create_page); @@ -780,6 +751,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, read_unlock_irq(&mapping->tree_lock); return i; } +EXPORT_SYMBOL(find_get_pages_contig); /** * find_get_pages_tag - find and return pages that match @tag @@ -808,13 +780,14 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, read_unlock_irq(&mapping->tree_lock); return ret; } +EXPORT_SYMBOL(find_get_pages_tag); /** * grab_cache_page_nowait - returns locked page at given index in given cache * @mapping: target address_space * @index: the page index * - * Same as grab_cache_page, but do not wait if the page is unavailable. + * Same as grab_cache_page(), but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. @@ -823,10 +796,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, * and deadlock against the caller's locked page. */ struct page * -grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { struct page *page = find_get_page(mapping, index); - gfp_t gfp_mask; if (page) { if (!TestSetPageLocked(page)) @@ -834,9 +806,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) page_cache_release(page); return NULL; } - gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; - page = alloc_pages(gfp_mask, 0); - if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; } @@ -871,7 +842,7 @@ static void shrink_readahead_size_eio(struct file *filp, /** * do_generic_mapping_read - generic file read routine * @mapping: address_space to be read - * @_ra: file's readahead state + * @ra: file's readahead state * @filp: the file to read * @ppos: current file position * @desc: read_descriptor @@ -887,67 +858,78 @@ static void shrink_readahead_size_eio(struct file *filp, * It may be NULL. */ void do_generic_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, + struct file_ra_state *ra, struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) { struct inode *inode = mapping->host; - unsigned long index; - unsigned long end_index; - unsigned long offset; - unsigned long last_index; - unsigned long next_index; - unsigned long prev_index; - loff_t isize; - struct page *cached_page; + pgoff_t index; + pgoff_t last_index; + pgoff_t prev_index; + unsigned long offset; /* offset into pagecache page */ + unsigned int prev_offset; int error; - struct file_ra_state ra = *_ra; - cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; - next_index = index; - prev_index = ra.prev_page; + prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; + prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; for (;;) { struct page *page; + pgoff_t end_index; + loff_t isize; unsigned long nr, ret; + cond_resched(); +find_page: + page = find_get_page(mapping, index); + if (!page) { + page_cache_sync_readahead(mapping, + ra, filp, + index, last_index - index); + page = find_get_page(mapping, index); + if (unlikely(page == NULL)) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, + ra, filp, page, + index, last_index - index); + } + if (!PageUptodate(page)) + goto page_not_up_to_date; +page_ok: + /* + * i_size must be checked after we know the page is Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + goto out; + } + /* nr is the maximum number of bytes to copy from this page */ nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; + if (index == end_index) { nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (nr <= offset) { + page_cache_release(page); goto out; } } nr = nr - offset; - cond_resched(); - if (index == next_index) - next_index = page_cache_readahead(mapping, &ra, filp, - index, last_index - index); - -find_page: - page = find_get_page(mapping, index); - if (unlikely(page == NULL)) { - handle_ra_miss(mapping, &ra, index); - goto no_cached_page; - } - if (!PageUptodate(page)) - goto page_not_up_to_date; -page_ok: - /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. @@ -956,10 +938,10 @@ page_ok: flush_dcache_page(page); /* - * When (part of) the same page is read multiple times - * in succession, only mark it as accessed the first time. + * When a sequential read accesses a page several times, + * only mark it as accessed the first time. */ - if (prev_index != index) + if (prev_index != index || offset != prev_offset) mark_page_accessed(page); prev_index = index; @@ -977,6 +959,7 @@ page_ok: offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; + prev_offset = offset; page_cache_release(page); if (ret == nr && desc->count) @@ -1025,37 +1008,12 @@ readpage: } unlock_page(page); error = -EIO; - shrink_readahead_size_eio(filp, &ra); + shrink_readahead_size_eio(filp, ra); goto readpage_error; } unlock_page(page); } - /* - * i_size must be checked after we have done ->readpage. - * - * Checking i_size after the readpage allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). - */ - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) { - page_cache_release(page); - goto out; - } - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - page_cache_release(page); - goto out; - } - } - nr = nr - offset; goto page_ok; readpage_error: @@ -1069,32 +1027,29 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) { - desc->error = -ENOMEM; - goto out; - } + page = page_cache_alloc_cold(mapping); + if (!page) { + desc->error = -ENOMEM; + goto out; } - error = add_to_page_cache_lru(cached_page, mapping, + error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { + page_cache_release(page); if (error == -EEXIST) goto find_page; desc->error = error; goto out; } - page = cached_page; - cached_page = NULL; goto readpage; } out: - *_ra = ra; + ra->prev_pos = prev_index; + ra->prev_pos <<= PAGE_CACHE_SHIFT; + ra->prev_pos |= prev_offset; - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - if (cached_page) - page_cache_release(cached_page); + *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; if (filp) file_accessed(filp); } @@ -1138,48 +1093,73 @@ success: return size; } -/** - * __generic_file_aio_read - generic filesystem read routine - * @iocb: kernel I/O control block +/* + * Performs necessary checks before doing a write * @iov: io vector request * @nr_segs: number of segments in the iovec - * @ppos: current file position + * @count: number of bytes to write + * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE * - * This is the "read()" routine for all filesystems - * that can use the page cache directly. + * Adjust number of segments and amount of bytes to write (nr_segs should be + * properly initialized first). Returns appropriate error code that caller + * should return or zero in case that write should be allowed. */ -ssize_t -__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +int generic_segment_checks(const struct iovec *iov, + unsigned long *nr_segs, size_t *count, int access_flags) { - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - - count = 0; - for (seg = 0; seg < nr_segs; seg++) { + unsigned long seg; + size_t cnt = 0; + for (seg = 0; seg < *nr_segs; seg++) { const struct iovec *iv = &iov[seg]; /* * If any segment has a negative length, or the cumulative * length ever wraps negative then return -EINVAL. */ - count += iv->iov_len; - if (unlikely((ssize_t)(count|iv->iov_len) < 0)) + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) return -EINVAL; - if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) + if (access_ok(access_flags, iv->iov_base, iv->iov_len)) continue; if (seg == 0) return -EFAULT; - nr_segs = seg; - count -= iv->iov_len; /* This segment is no good */ + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ break; } + *count = cnt; + return 0; +} +EXPORT_SYMBOL(generic_segment_checks); + +/** + * generic_file_aio_read - generic filesystem read routine + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @pos: current file position + * + * This is the "read()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t +generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; + + count = 0; + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { - loff_t pos = *ppos, size; + loff_t size; struct address_space *mapping; struct inode *inode; @@ -1192,13 +1172,13 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (pos < size) { retval = generic_file_direct_IO(READ, iocb, iov, pos, nr_segs); - if (retval > 0 && !is_sync_kiocb(iocb)) - retval = -EIOCBQUEUED; if (retval > 0) *ppos = pos + retval; } - file_accessed(filp); - goto out; + if (likely(retval != 0)) { + file_accessed(filp); + goto out; + } } retval = 0; @@ -1218,80 +1198,18 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = retval ?: desc.error; break; } + if (desc.count > 0) + break; } } out: return retval; } -EXPORT_SYMBOL(__generic_file_aio_read); - -ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - BUG_ON(iocb->ki_pos != pos); - return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos); -} EXPORT_SYMBOL(generic_file_aio_read); -ssize_t -generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} -EXPORT_SYMBOL(generic_file_read); - -int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) -{ - ssize_t written; - unsigned long count = desc->count; - struct file *file = desc->arg.data; - - if (size > count) - size = count; - - written = file->f_op->sendpage(file, page, offset, - size, &file->f_pos, sizeerror = written; - written = 0; - } - desc->count = count - written; - desc->written += written; - return written; -} - -ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) -{ - read_descriptor_t desc; - - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - - do_generic_file_read(in_file, ppos, &desc, actor); - if (desc.written) - return desc.written; - return desc.error; -} -EXPORT_SYMBOL(generic_file_sendfile); - static ssize_t do_readahead(struct address_space *mapping, struct file *filp, - unsigned long index, unsigned long nr) + pgoff_t index, unsigned long nr) { if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; @@ -1311,8 +1229,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) if (file) { if (file->f_mode & FMODE_READ) { struct address_space *mapping = file->f_mapping; - unsigned long start = offset >> PAGE_CACHE_SHIFT; - unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + pgoff_t start = offset >> PAGE_CACHE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; unsigned long len = end - start + 1; ret = do_readahead(mapping, file, start, len); } @@ -1322,7 +1240,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) } #ifdef CONFIG_MMU -static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); /** * page_cache_read - adds requested page to the page cache if not already there * @file: file to read @@ -1331,7 +1248,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int fastcall page_cache_read(struct file * file, unsigned long offset) +static int fastcall page_cache_read(struct file * file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; struct page *page; @@ -1358,69 +1275,69 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) #define MMAP_LOTSAMISS (100) /** - * filemap_nopage - read in file data for page fault handling - * @area: the applicable vm_area - * @address: target address to read in - * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL + * filemap_fault - read in file data for page fault handling + * @vma: vma in which the fault was taken + * @vmf: struct vm_fault containing details of the fault * - * filemap_nopage() is invoked via the vma operations vector for a + * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. */ -struct page *filemap_nopage(struct vm_area_struct *area, - unsigned long address, int *type) +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int error; - struct file *file = area->vm_file; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; struct page *page; - unsigned long size, pgoff; - int did_readaround = 0, majmin = VM_FAULT_MINOR; - - pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + unsigned long size; + int did_readaround = 0; + int ret = 0; -retry_all: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff >= size) + if (vmf->pgoff >= size) goto outside_data_content; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(area)) + if (VM_RandomReadHint(vma)) goto no_cached_page; /* - * The readahead code wants to be told about each and every page - * so it can build and shrink its windows appropriately - * - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(area)) - page_cache_readahead(mapping, ra, file, pgoff, 1); - - /* * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_lock_page(mapping, vmf->pgoff); + /* + * For sequential accesses, we use the generic readahead logic. + */ + if (VM_SequentialReadHint(vma)) { + if (!page) { + page_cache_sync_readahead(mapping, ra, file, + vmf->pgoff, 1); + page = find_lock_page(mapping, vmf->pgoff); + if (!page) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, ra, file, page, + vmf->pgoff, 1); + } + } + if (!page) { unsigned long ra_pages; - if (VM_SequentialReadHint(area)) { - handle_ra_miss(mapping, ra, pgoff); - goto no_cached_page; - } ra->mmap_miss++; /* * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ - if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) + if (ra->mmap_miss > MMAP_LOTSAMISS) goto no_cached_page; /* @@ -1428,7 +1345,7 @@ retry_find: * check did_readaround, as this is an inner loop. */ if (!did_readaround) { - majmin = VM_FAULT_MAJOR; + ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } did_readaround = 1; @@ -1436,49 +1353,56 @@ retry_find: if (ra_pages) { pgoff_t start = 0; - if (pgoff > ra_pages / 2) - start = pgoff - ra_pages / 2; + if (vmf->pgoff > ra_pages / 2) + start = vmf->pgoff - ra_pages / 2; do_page_cache_readahead(mapping, file, start, ra_pages); } - page = find_get_page(mapping, pgoff); + page = find_lock_page(mapping, vmf->pgoff); if (!page) goto no_cached_page; } if (!did_readaround) - ra->mmap_hit++; + ra->mmap_miss--; /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. + * We have a locked page in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error. */ - if (!PageUptodate(page)) + if (unlikely(!PageUptodate(page))) goto page_not_uptodate; -success: + /* Must recheck i_size under page lock */ + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + unlock_page(page); + page_cache_release(page); + goto outside_data_content; + } + /* * Found the page and have a reference on it. */ mark_page_accessed(page); - if (type) - *type = majmin; - return page; + ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + vmf->page = page; + return ret | VM_FAULT_LOCKED; outside_data_content: /* * An external ptracer can access pages that normally aren't * accessible.. */ - if (area->vm_mm == current->mm) - return NOPAGE_SIGBUS; + if (vma->vm_mm == current->mm) + return VM_FAULT_SIGBUS; + /* Fall through to the non-read-ahead case */ no_cached_page: /* * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, pgoff); - grab_swap_token(); + error = page_cache_read(file, vmf->pgoff); /* * The page we want has now been added to the page cache. @@ -1494,38 +1418,15 @@ no_cached_page: * to schedule I/O. */ if (error == -ENOMEM) - return NOPAGE_OOM; - return NOPAGE_SIGBUS; + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; page_not_uptodate: + /* IO error path */ if (!did_readaround) { - majmin = VM_FAULT_MAJOR; + ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } - lock_page(page); - - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Did somebody else get it up-to-date? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } /* * Umm, take care of errors if the page isn't up-to-date. @@ -1533,222 +1434,26 @@ page_not_uptodate: * because there really aren't any performance issues here * and we need to check for errors. */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Somebody else successfully read it in? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } ClearPageError(page); error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); + page_cache_release(page); + + if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - } - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. - */ + /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(file, ra); - page_cache_release(page); - return NOPAGE_SIGBUS; + return VM_FAULT_SIGBUS; } -EXPORT_SYMBOL(filemap_nopage); +EXPORT_SYMBOL(filemap_fault); + +struct vm_operations_struct generic_file_vm_ops = { + .fault = filemap_fault, +}; -static struct page * filemap_getpage(struct file *file, unsigned long pgoff, - int nonblock) -{ - struct address_space *mapping = file->f_mapping; - struct page *page; - int error; - - /* - * Do we have something in the page cache already? - */ -retry_find: - page = find_get_page(mapping, pgoff); - if (!page) { - if (nonblock) - return NULL; - goto no_cached_page; - } - - /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. - */ - if (!PageUptodate(page)) { - if (nonblock) { - page_cache_release(page); - return NULL; - } - goto page_not_uptodate; - } - -success: - /* - * Found the page and have a reference on it. - */ - mark_page_accessed(page); - return page; - -no_cached_page: - error = page_cache_read(file, pgoff); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - return NULL; - -page_not_uptodate: - lock_page(page); - - /* Did it get truncated while we waited for it? */ - if (!page->mapping) { - unlock_page(page); - goto err; - } - - /* Did somebody else get it up-to-date? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Umm, take care of errors if the page isn't up-to-date. - * Try to re-read it _once_. We do this synchronously, - * because there really aren't any performance issues here - * and we need to check for errors. - */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - goto err; - } - /* Somebody else successfully read it in? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - ClearPageError(page); - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. - */ -err: - page_cache_release(page); - - return NULL; -} - -int filemap_populate(struct vm_area_struct *vma, unsigned long addr, - unsigned long len, pgprot_t prot, unsigned long pgoff, - int nonblock) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned long size; - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int err; - - if (!nonblock) - force_page_cache_readahead(mapping, vma->vm_file, - pgoff, len >> PAGE_CACHE_SHIFT); - -repeat: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) - return -EINVAL; - - page = filemap_getpage(file, pgoff, nonblock); - - /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as - * done in shmem_populate calling shmem_getpage */ - if (!page && !nonblock) - return -ENOMEM; - - if (page) { - err = install_page(mm, vma, addr, page, prot); - if (err) { - page_cache_release(page); - return err; - } - } else if (vma->vm_flags & VM_NONLINEAR) { - /* No page was found just because we can't read it in now (being - * here implies nonblock != 0), but the page may exist, so set - * the PTE to fault it in later. */ - err = install_file_pte(mm, vma, addr, pgoff, prot); - if (err) - return err; - } - - len -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - if (len) - goto repeat; - - return 0; -} -EXPORT_SYMBOL(filemap_populate); - -struct vm_operations_struct generic_file_vm_ops = { - .nopage = filemap_nopage, - .populate = filemap_populate, -}; - -/* This is used for a general mmap of a disk file */ - -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { struct address_space *mapping = file->f_mapping; @@ -1756,6 +1461,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -1782,55 +1488,42 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); -static inline struct page *__read_cache_page(struct address_space *mapping, - unsigned long index, +static struct page *__read_cache_page(struct address_space *mapping, + pgoff_t index, int (*filler)(void *,struct page*), void *data) { - struct page *page, *cached_page = NULL; + struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) - return ERR_PTR(-ENOMEM); - } - err = add_to_page_cache_lru(cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err < 0) { + page = page_cache_alloc_cold(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; /* Presumably ENOMEM for radix tree node */ - page_cache_release(cached_page); return ERR_PTR(err); } - page = cached_page; - cached_page = NULL; err = filler(data, page); if (err < 0) { page_cache_release(page); page = ERR_PTR(err); } } - if (cached_page) - page_cache_release(cached_page); return page; } -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Read into the page cache. If a page already exists, - * and PageUptodate() is not set, try to fill the page. +/* + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. */ -struct page *read_cache_page(struct address_space *mapping, - unsigned long index, +struct page *read_cache_page_async(struct address_space *mapping, + pgoff_t index, int (*filler)(void *,struct page*), void *data) { @@ -1840,8 +1533,7 @@ struct page *read_cache_page(struct address_space *mapping, retry: page = __read_cache_page(mapping, index, filler, data); if (IS_ERR(page)) - goto out; - mark_page_accessed(page); + return page; if (PageUptodate(page)) goto out; @@ -1858,46 +1550,45 @@ retry: err = filler(data, page); if (err < 0) { page_cache_release(page); - page = ERR_PTR(err); + return ERR_PTR(err); } - out: +out: + mark_page_accessed(page); return page; } -EXPORT_SYMBOL(read_cache_page); +EXPORT_SYMBOL(read_cache_page_async); -/* - * If the page was newly created, increment its refcount and add it to the - * caller's lru-buffering pagevec. This function is specifically for - * generic_file_write(). +/** + * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page then wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. */ -static inline struct page * -__grab_cache_page(struct address_space *mapping, unsigned long index, - struct page **cached_page, struct pagevec *lru_pvec) +struct page *read_cache_page(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) { - int err; struct page *page; -repeat: - page = find_lock_page(mapping, index); - if (!page) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (!*cached_page) - return NULL; - } - err = add_to_page_cache(*cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err == 0) { - page = *cached_page; - page_cache_get(page); - if (!pagevec_add(lru_pvec, page)) - __pagevec_lru_add(lru_pvec); - *cached_page = NULL; - } + + page = read_cache_page_async(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); } + out: return page; } +EXPORT_SYMBOL(read_cache_page); /* * The logic we want is @@ -1905,11 +1596,10 @@ repeat: * if suid or (sgid and xgrp) * remove privs */ -int remove_suid(struct dentry *dentry) +int should_remove_suid(struct dentry *dentry) { mode_t mode = dentry->d_inode->i_mode; int kill = 0; - int result = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) @@ -1922,18 +1612,39 @@ int remove_suid(struct dentry *dentry) if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; - if (unlikely(kill && !capable(CAP_FSETID))) { - struct iattr newattrs; + if (unlikely(kill && !capable(CAP_FSETID))) + return kill; - newattrs.ia_valid = ATTR_FORCE | kill; - result = notify_change(dentry, &newattrs); - } - return result; + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + +int __remove_suid(struct dentry *dentry, int kill) +{ + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + return notify_change(dentry, &newattrs); +} + +int remove_suid(struct dentry *dentry) +{ + int killsuid = should_remove_suid(dentry); + int killpriv = security_inode_need_killpriv(dentry); + int error = 0; + + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); + + return error; } EXPORT_SYMBOL(remove_suid); -size_t -__filemap_copy_from_user_iovec_inatomic(char *vaddr, +static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { size_t copied = 0, left = 0; @@ -1956,6 +1667,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, } /* + * Copy as much as we can into the page and return the number of bytes which + * were sucessfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page, KM_USER0); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic_nocache(kaddr + offset, + buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr, KM_USER0); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same sideeffects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. + */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_nocache(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user); + +static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) +{ + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + + while (bytes) { + int copy = min(bytes, iov->iov_len - base); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + } +} + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + __iov_iter_advance_iov(i, bytes); + i->count -= bytes; +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. @@ -1992,7 +1821,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (unlikely(*pos + *count > MAX_NON_LFS && !(file->f_flags & O_LARGEFILE))) { if (*pos >= MAX_NON_LFS) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } if (*count > MAX_NON_LFS - (unsigned long)*pos) { @@ -2010,7 +1838,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (likely(!isblk)) { if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { if (*count || *pos > inode->i_sb->s_maxbytes) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } /* zero-length writes at ->s_maxbytes are OK */ @@ -2039,6 +1866,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i } EXPORT_SYMBOL(generic_write_checks); +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + if (aops->write_begin) { + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); + } else { + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct page *page; +again: + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + return -ENOMEM; + + if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { + /* + * There is no way to resolve a short write situation + * for a !Uptodate page (except by double copying in + * the caller done by generic_perform_write_2copy). + * + * Instead, we have to bring it uptodate here. + */ + ret = aops->readpage(file, page); + page_cache_release(page); + if (ret) { + if (ret == AOP_TRUNCATED_PAGE) + goto again; + return ret; + } + goto again; + } + + ret = aops->prepare_write(file, page, offset, offset+len); + if (ret) { + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + return ret; + } +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + int ret; + + if (aops->write_end) { + mark_page_accessed(page); + ret = aops->write_end(file, mapping, pos, len, copied, + page, fsdata); + } else { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + + flush_dcache_page(page); + ret = aops->commit_write(file, page, offset, offset+len); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + if (ret < 0) { + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } else if (ret > 0) + ret = min_t(size_t, copied, ret); + else + ret = copied; + } + + return ret; +} +EXPORT_SYMBOL(pagecache_write_end); + ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, @@ -2066,158 +1978,326 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. * i_mutex is held, which protects generic_osync_inode() from - * livelocking. + * livelocking. AIO O_DIRECT ops attempt to sync metadata here. */ - if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if ((written >= 0 || written == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); if (err < 0) written = err; } - if (written == count && !is_sync_kiocb(iocb)) - written = -EIOCBQUEUED; return written; } EXPORT_SYMBOL(generic_file_direct_write); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +/* + * Find or create a page at the given pagecache position. Return the locked + * page. This function is specifically for buffered writes. + */ +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) { - struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - struct page *page; - struct page *cached_page = NULL; - size_t bytes; - struct pagevec lru_pvec; - const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_base = 0; /* offset in the current iovec */ - char __user *buf; - - pagevec_init(&lru_pvec, 0); + int status; + struct page *page; +repeat: + page = find_lock_page(mapping, index); + if (likely(page)) + return page; - /* - * handle partial DIO write. Adjust cur_iov if needed. - */ - if (likely(nr_segs == 1)) - buf = iov->iov_base + written; - else { - filemap_set_next_iovec(&cur_iov, &iov_base, written); - buf = cur_iov->iov_base + iov_base; + page = page_cache_alloc(mapping); + if (!page) + return NULL; + status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(status)) { + page_cache_release(page); + if (status == -EEXIST) + goto repeat; + return NULL; } + return page; +} +EXPORT_SYMBOL(__grab_cache_page); + +static ssize_t generic_perform_write_2copy(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + long status = 0; + ssize_t written = 0; do { - unsigned long index; - unsigned long offset; - size_t copied; + struct page *src_page; + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + offset = (pos & (PAGE_CACHE_SIZE - 1)); index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - - /* Limit the size of the copy to the caller's write size */ - bytes = min(bytes, count); + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); /* - * Limit the size of the copy to that of the current segment, - * because fault_in_pages_readable() doesn't know how to walk - * segments. + * a non-NULL src_page indicates that we're doing the + * copy via get_user_pages and kmap. */ - bytes = min(bytes, cur_iov->iov_len - iov_base); + src_page = NULL; /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. */ - fault_in_pages_readable(buf, bytes); + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + page = __grab_cache_page(mapping, index); if (!page) { status = -ENOMEM; break; } - if (unlikely(bytes == 0)) { - status = 0; - copied = 0; - goto zero_length_segment; - } + /* + * non-uptodate pages cannot cope with short copies, and we + * cannot take a pagefault with the destination page locked. + * So pin the source page to copy it. + */ + if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { + unlock_page(page); - status = a_ops->prepare_write(file, page, offset, offset+bytes); - if (unlikely(status)) { - loff_t isize = i_size_read(inode); + src_page = alloc_page(GFP_KERNEL); + if (!src_page) { + page_cache_release(page); + status = -ENOMEM; + break; + } - if (status != AOP_TRUNCATED_PAGE) + /* + * Cannot get_user_pages with a page locked for the + * same reason as we can't take a page fault with a + * page locked (as explained below). + */ + copied = iov_iter_copy_from_user(src_page, i, + offset, bytes); + if (unlikely(copied == 0)) { + status = -EFAULT; + page_cache_release(page); + page_cache_release(src_page); + break; + } + bytes = copied; + + lock_page(page); + /* + * Can't handle the page going uptodate here, because + * that means we would use non-atomic usercopies, which + * zero out the tail of the page, which can cause + * zeroes to become transiently visible. We could just + * use a non-zeroing copy, but the APIs aren't too + * consistent. + */ + if (unlikely(!page->mapping || PageUptodate(page))) { unlock_page(page); - page_cache_release(page); - if (status == AOP_TRUNCATED_PAGE) + page_cache_release(page); + page_cache_release(src_page); continue; + } + } + + status = a_ops->prepare_write(file, page, offset, offset+bytes); + if (unlikely(status)) + goto fs_write_aop_error; + + if (!src_page) { /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. + * Must not enter the pagefault handler here, because + * we hold the page lock, so we might recursively + * deadlock on the same lock, or get an ABBA deadlock + * against a different lock, or against the mmap_sem + * (which nests outside the page lock). So increment + * preempt count, and use _atomic usercopies. + * + * The page is uptodate so we are OK to encounter a + * short copy: if unmodified parts of the page are + * marked dirty and written out to disk, it doesn't + * really matter. */ - if (pos + bytes > isize) - vmtruncate(inode, isize); - break; + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, + offset, bytes); + pagefault_enable(); + } else { + void *src, *dst; + src = kmap_atomic(src_page, KM_USER0); + dst = kmap_atomic(page, KM_USER1); + memcpy(dst + offset, src + offset, bytes); + kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); + copied = bytes; } - if (likely(nr_segs == 1)) - copied = filemap_copy_from_user(page, offset, - buf, bytes); - else - copied = filemap_copy_from_user_iovec(page, offset, - cur_iov, iov_base, bytes); flush_dcache_page(page); + status = a_ops->commit_write(file, page, offset, offset+bytes); - if (status == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } -zero_length_segment: - if (likely(copied >= 0)) { - if (!status) - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - if (unlikely(nr_segs > 1)) { - filemap_set_next_iovec(&cur_iov, - &iov_base, status); - if (count) - buf = cur_iov->iov_base + - iov_base; - } else { - iov_base += status; - } - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; + if (unlikely(status < 0)) + goto fs_write_aop_error; + if (unlikely(status > 0)) /* filesystem did partial write */ + copied = min_t(size_t, copied, status); + unlock_page(page); mark_page_accessed(page); page_cache_release(page); - if (status < 0) - break; + if (src_page) + page_cache_release(src_page); + + iov_iter_advance(i, copied); + pos += copied; + written += copied; + balance_dirty_pages_ratelimited(mapping); cond_resched(); - } while (count); - *ppos = pos; + continue; + +fs_write_aop_error: + unlock_page(page); + page_cache_release(page); + if (src_page) + page_cache_release(src_page); - if (cached_page) - page_cache_release(cached_page); + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + bytes > inode->i_size) + vmtruncate(inode, inode->i_size); + break; + } while (iov_iter_count(i)); + + return written ? written : status; +} + +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + * Copies from kernel address space cannot fail (NFSD is a big user). */ + if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) + break; + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + iov_iter_advance(i, copied); + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + if (a_ops->write_begin) + status = generic_perform_write(file, &i, pos); + else + status = generic_perform_write_2copy(file, &i, pos); + if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, @@ -2233,7 +2313,6 @@ zero_length_segment: if (unlikely(file->f_flags & O_DIRECT) && written) status = filemap_write_and_wait(mapping); - pagevec_lru_add(&lru_pvec); return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); @@ -2243,34 +2322,18 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; - const struct address_space * mapping = file->f_mapping; + struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; - unsigned long seg; loff_t pos; ssize_t written; ssize_t err; ocount = 0; - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - ocount += iv->iov_len; - if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - nr_segs = seg; - ocount -= iv->iov_len; /* This segment is no good */ - break; - } + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + return err; count = ocount; pos = *ppos; @@ -2288,7 +2351,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_dentry); + err = remove_suid(file->f_path.dentry); if (err) goto out; @@ -2296,8 +2359,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { - written = generic_file_direct_write(iocb, iov, - &nr_segs, pos, ppos, count, ocount); + loff_t endbyte; + ssize_t written_buffered; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, + ppos, count, ocount); if (written < 0 || written == count) goto out; /* @@ -2306,10 +2372,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, */ pos += written; count -= written; - } + written_buffered = generic_file_buffered_write(iocb, iov, + nr_segs, pos, ppos, count, + written); + /* + * If generic_file_buffered_write() retuned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + err = written_buffered; + goto out; + } - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = pos + written_buffered - written - 1; + err = do_sync_mapping_range(file->f_mapping, pos, endbyte, + SYNC_FILE_RANGE_WAIT_BEFORE| + SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + if (err == 0) { + written = written_buffered; + invalidate_mapping_pages(mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, ppos, count, written); + } out: current->backing_dev_info = NULL; return written ? written : err; @@ -2339,38 +2441,6 @@ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, } EXPORT_SYMBOL(generic_file_aio_write_nolock); -static ssize_t -__generic_file_write_nolock(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} - -ssize_t -generic_file_write_nolock(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, *ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - *ppos = kiocb.ki_pos; - return ret; -} -EXPORT_SYMBOL(generic_file_write_nolock); - ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -2397,30 +2467,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_aio_write); -ssize_t generic_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; - - mutex_lock(&inode->i_mutex); - ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); - mutex_unlock(&inode->i_mutex); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range(inode, mapping, *ppos - ret, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_write); - /* * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. @@ -2432,7 +2478,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; ssize_t retval; - size_t write_len = 0; + size_t write_len; + pgoff_t end = 0; /* silence gcc */ /* * If it's a write, unmap all mmappings of the file up-front. This @@ -2441,23 +2488,46 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ if (rw == WRITE) { write_len = iov_length(iov, nr_segs); + end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; if (mapping_mapped(mapping)) unmap_mapping_range(mapping, offset, write_len, 0); } retval = filemap_write_and_wait(mapping); - if (retval == 0) { - retval = mapping->a_ops->direct_IO(rw, iocb, iov, - offset, nr_segs); - if (rw == WRITE && mapping->nrpages) { - pgoff_t end = (offset + write_len - 1) - >> PAGE_CACHE_SHIFT; - int err = invalidate_inode_pages2_range(mapping, + if (retval) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (rw == WRITE && mapping->nrpages) { + retval = invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); - if (err) - retval = err; - } + if (retval) + goto out; } + + retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); + if (retval) + goto out; + + /* + * Finally, try again to invalidate clean pages which might have been + * faulted in by get_user_pages() if the source of the write was an + * mmap()ed region of the file we're writing. That's a pretty crazy + * thing to do, so we don't support it 100%. If this invalidation + * fails and we have -EIOCBQUEUED we ignore the failure. + */ + if (rw == WRITE && mapping->nrpages) { + int err = invalidate_inode_pages2_range(mapping, + offset >> PAGE_CACHE_SHIFT, end); + if (err && retval >= 0) + retval = err; + } +out: return retval; }