X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Ffilemap.c;h=1e6a7d34874fd31f3e9549c7540a4ce28c2402c6;hb=cc1a9d86ce989083703c4bdc11b75a87e1cc404a;hp=3c97bdc74a8556bba388b6c2b2e9fa929b77f22b;hpb=7ff81078d8b9f3d05a27b7bd3786ffb1ef1b0d1f;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/filemap.c b/mm/filemap.c index 3c97bdc..1e6a7d3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -25,12 +25,14 @@ #include #include #include +#include #include #include #include #include #include -#include "filemap.h" +#include /* for BUG_ON(!in_atomic()) only */ +#include #include "internal.h" /* @@ -116,11 +118,24 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + mem_cgroup_uncharge_page(page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); + + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * + * Fix it up by doing a final dirty accounting check after + * having removed the page entirely. + */ + if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + } } void remove_from_page_cache(struct page *page) @@ -170,6 +185,12 @@ static int sync_page(void *word) return 0; } +static int sync_page_killable(void *word) +{ + sync_page(word); + return fatal_signal_pending(current) ? -EINTR : 0; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -322,7 +343,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range); /** - * sync_page_range_nolock + * sync_page_range_nolock - write & wait on all pages in the passed range without locking * @inode: target inode * @mapping: target address_space * @pos: beginning offset in pages to write @@ -438,8 +459,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + int error = mem_cgroup_cache_charge(page, current->mm, + gfp_mask & ~__GFP_HIGHMEM); + if (error) + goto out; + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); @@ -450,10 +475,14 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, page->index = offset; mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - } + } else + mem_cgroup_uncharge_page(page); + write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); - } + } else + mem_cgroup_uncharge_page(page); +out: return error; } EXPORT_SYMBOL(add_to_page_cache); @@ -507,7 +536,7 @@ static inline void wake_up_page(struct page *page, int bit) __wake_up_bit(page_waitqueue(page), &page->flags, bit); } -void fastcall wait_on_page_bit(struct page *page, int bit_nr) +void wait_on_page_bit(struct page *page, int bit_nr) { DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); @@ -531,7 +560,7 @@ EXPORT_SYMBOL(wait_on_page_bit); * the clear_bit and the read of the waitqueue (to avoid SMP races with a * parallel wait_on_page_locked()). */ -void fastcall unlock_page(struct page *page) +void unlock_page(struct page *page) { smp_mb__before_clear_bit(); if (!TestClearPageLocked(page)) @@ -547,10 +576,12 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - if (!test_clear_page_writeback(page)) - BUG(); - } + if (TestClearPageReclaim(page)) + rotate_reclaimable_page(page); + + if (!test_clear_page_writeback(page)) + BUG(); + smp_mb__after_clear_bit(); wake_up_page(page, PG_writeback); } @@ -565,7 +596,7 @@ EXPORT_SYMBOL(end_page_writeback); * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ -void fastcall __lock_page(struct page *page) +void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); @@ -574,11 +605,22 @@ void fastcall __lock_page(struct page *page) } EXPORT_SYMBOL(__lock_page); -/* +int __lock_page_killable(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + return __wait_on_bit_lock(page_waitqueue(page), &wait, + sync_page_killable, TASK_KILLABLE); +} + +/** + * __lock_page_nosync - get a lock on the page, without calling sync_page() + * @page: the page to lock + * * Variant of lock_page that does not require the caller to hold a reference * on the page's mapping. */ -void fastcall __lock_page_nosync(struct page *page) +void __lock_page_nosync(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, @@ -593,7 +635,7 @@ void fastcall __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, unsigned long offset) +struct page * find_get_page(struct address_space *mapping, pgoff_t offset) { struct page *page; @@ -617,30 +659,31 @@ EXPORT_SYMBOL(find_get_page); * Returns zero if the page was not present. find_lock_page() may sleep. */ struct page *find_lock_page(struct address_space *mapping, - unsigned long offset) + pgoff_t offset) { struct page *page; - read_lock_irq(&mapping->tree_lock); repeat: + read_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); __lock_page(page); - read_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping || - page->index != offset)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); page_cache_release(page); goto repeat; } + VM_BUG_ON(page->index != offset); + goto out; } } read_unlock_irq(&mapping->tree_lock); +out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -663,29 +706,24 @@ EXPORT_SYMBOL(find_lock_page); * memory exhaustion. */ struct page *find_or_create_page(struct address_space *mapping, - unsigned long index, gfp_t gfp_mask) + pgoff_t index, gfp_t gfp_mask) { - struct page *page, *cached_page = NULL; + struct page *page; int err; repeat: page = find_lock_page(mapping, index); if (!page) { - if (!cached_page) { - cached_page = - __page_cache_alloc(gfp_mask); - if (!cached_page) - return NULL; + page = __page_cache_alloc(gfp_mask); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + if (unlikely(err)) { + page_cache_release(page); + page = NULL; + if (err == -EEXIST) + goto repeat; } - err = add_to_page_cache_lru(cached_page, mapping, - index, gfp_mask); - if (!err) { - page = cached_page; - cached_page = NULL; - } else if (err == -EEXIST) - goto repeat; } - if (cached_page) - page_cache_release(cached_page); return page; } EXPORT_SYMBOL(find_or_create_page); @@ -797,7 +835,7 @@ EXPORT_SYMBOL(find_get_pages_tag); * and deadlock against the caller's locked page. */ struct page * -grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { struct page *page = find_get_page(mapping, index); @@ -841,9 +879,7 @@ static void shrink_readahead_size_eio(struct file *filp, } /** - * do_generic_mapping_read - generic file read routine - * @mapping: address_space to be read - * @_ra: file's readahead state + * do_generic_file_read - generic file read routine * @filp: the file to read * @ppos: current file position * @desc: read_descriptor @@ -854,30 +890,21 @@ static void shrink_readahead_size_eio(struct file *filp, * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. - * - * Note the struct file* is only passed for the use of readpage. - * It may be NULL. */ -void do_generic_mapping_read(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - loff_t *ppos, - read_descriptor_t *desc, - read_actor_t actor) +static void do_generic_file_read(struct file *filp, loff_t *ppos, + read_descriptor_t *desc, read_actor_t actor) { + struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - unsigned long index; - unsigned long offset; - unsigned long last_index; - unsigned long next_index; - unsigned long prev_index; + struct file_ra_state *ra = &filp->f_ra; + pgoff_t index; + pgoff_t last_index; + pgoff_t prev_index; + unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; - struct page *cached_page; int error; - cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; - next_index = index; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; @@ -885,7 +912,7 @@ void do_generic_mapping_read(struct address_space *mapping, for (;;) { struct page *page; - unsigned long end_index; + pgoff_t end_index; loff_t isize; unsigned long nr, ret; @@ -973,7 +1000,8 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - lock_page(page); + if (lock_page_killable(page)) + goto readpage_eio; /* Did it get truncated before we got the lock? */ if (!page->mapping) { @@ -1001,7 +1029,8 @@ readpage: } if (!PageUptodate(page)) { - lock_page(page); + if (lock_page_killable(page)) + goto readpage_eio; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -1012,15 +1041,16 @@ readpage: goto find_page; } unlock_page(page); - error = -EIO; shrink_readahead_size_eio(filp, ra); - goto readpage_error; + goto readpage_eio; } unlock_page(page); } goto page_ok; +readpage_eio: + error = -EIO; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; @@ -1032,23 +1062,20 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) { - desc->error = -ENOMEM; - goto out; - } + page = page_cache_alloc_cold(mapping); + if (!page) { + desc->error = -ENOMEM; + goto out; } - error = add_to_page_cache_lru(cached_page, mapping, + error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { + page_cache_release(page); if (error == -EEXIST) goto find_page; desc->error = error; goto out; } - page = cached_page; - cached_page = NULL; goto readpage; } @@ -1058,12 +1085,9 @@ out: ra->prev_pos |= prev_offset; *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; - if (cached_page) - page_cache_release(cached_page); if (filp) file_accessed(filp); } -EXPORT_SYMBOL(do_generic_mapping_read); int file_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset, unsigned long size) @@ -1219,7 +1243,7 @@ EXPORT_SYMBOL(generic_file_aio_read); static ssize_t do_readahead(struct address_space *mapping, struct file *filp, - unsigned long index, unsigned long nr) + pgoff_t index, unsigned long nr) { if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; @@ -1239,8 +1263,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) if (file) { if (file->f_mode & FMODE_READ) { struct address_space *mapping = file->f_mapping; - unsigned long start = offset >> PAGE_CACHE_SHIFT; - unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + pgoff_t start = offset >> PAGE_CACHE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; unsigned long len = end - start + 1; ret = do_readahead(mapping, file, start, len); } @@ -1250,7 +1274,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) } #ifdef CONFIG_MMU -static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); /** * page_cache_read - adds requested page to the page cache if not already there * @file: file to read @@ -1259,7 +1282,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int fastcall page_cache_read(struct file * file, unsigned long offset) +static int page_cache_read(struct file *file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; struct page *page; @@ -1305,13 +1328,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; struct page *page; - unsigned long size; + pgoff_t size; int did_readaround = 0; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) - goto outside_data_content; + return VM_FAULT_SIGBUS; /* If we don't want any read-ahead, don't bother */ if (VM_RandomReadHint(vma)) @@ -1388,7 +1411,7 @@ retry_find: if (unlikely(vmf->pgoff >= size)) { unlock_page(page); page_cache_release(page); - goto outside_data_content; + return VM_FAULT_SIGBUS; } /* @@ -1399,15 +1422,6 @@ retry_find: vmf->page = page; return ret | VM_FAULT_LOCKED; -outside_data_content: - /* - * An external ptracer can access pages that normally aren't - * accessible.. - */ - if (vma->vm_mm == current->mm) - return VM_FAULT_SIGBUS; - - /* Fall through to the non-read-ahead case */ no_cached_page: /* * We're only likely to ever get here if MADV_RANDOM is in @@ -1447,6 +1461,11 @@ page_not_uptodate: */ ClearPageError(page); error = mapping->a_ops->readpage(file, page); + if (!error) { + wait_on_page_locked(page); + if (!PageUptodate(page)) + error = -EIO; + } page_cache_release(page); if (!error || error == AOP_TRUNCATED_PAGE) @@ -1500,48 +1519,52 @@ EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, - unsigned long index, + pgoff_t index, int (*filler)(void *,struct page*), void *data) { - struct page *page, *cached_page = NULL; + struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) - return ERR_PTR(-ENOMEM); - } - err = add_to_page_cache_lru(cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err < 0) { + page = page_cache_alloc_cold(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; /* Presumably ENOMEM for radix tree node */ - page_cache_release(cached_page); return ERR_PTR(err); } - page = cached_page; - cached_page = NULL; err = filler(data, page); if (err < 0) { page_cache_release(page); page = ERR_PTR(err); } } - if (cached_page) - page_cache_release(cached_page); return page; } -/* +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. */ struct page *read_cache_page_async(struct address_space *mapping, - unsigned long index, + pgoff_t index, int (*filler)(void *,struct page*), void *data) { @@ -1589,7 +1612,7 @@ EXPORT_SYMBOL(read_cache_page_async); * If the page does not get brought uptodate, return -EIO. */ struct page *read_cache_page(struct address_space *mapping, - unsigned long index, + pgoff_t index, int (*filler)(void *,struct page*), void *data) { @@ -1609,40 +1632,6 @@ struct page *read_cache_page(struct address_space *mapping, EXPORT_SYMBOL(read_cache_page); /* - * If the page was newly created, increment its refcount and add it to the - * caller's lru-buffering pagevec. This function is specifically for - * generic_file_write(). - */ -static inline struct page * -__grab_cache_page(struct address_space *mapping, unsigned long index, - struct page **cached_page, struct pagevec *lru_pvec) -{ - int err; - struct page *page; -repeat: - page = find_lock_page(mapping, index); - if (!page) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (!*cached_page) - return NULL; - } - err = add_to_page_cache(*cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err == 0) { - page = *cached_page; - page_cache_get(page); - if (!pagevec_add(lru_pvec, page)) - __pagevec_lru_add(lru_pvec); - *cached_page = NULL; - } - } - return page; -} - -/* * The logic we want is * * if suid or (sgid and xgrp) @@ -1671,7 +1660,7 @@ int should_remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(should_remove_suid); -int __remove_suid(struct dentry *dentry, int kill) +static int __remove_suid(struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1681,17 +1670,22 @@ int __remove_suid(struct dentry *dentry, int kill) int remove_suid(struct dentry *dentry) { - int kill = should_remove_suid(dentry); + int killsuid = should_remove_suid(dentry); + int killpriv = security_inode_need_killpriv(dentry); + int error = 0; - if (unlikely(kill)) - return __remove_suid(dentry, kill); + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); - return 0; + return error; } EXPORT_SYMBOL(remove_suid); -size_t -__filemap_copy_from_user_iovec_inatomic(char *vaddr, +static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { size_t copied = 0, left = 0; @@ -1714,6 +1708,126 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, } /* + * Copy as much as we can into the page and return the number of bytes which + * were sucessfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page, KM_USER0); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic_nocache(kaddr + offset, + buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr, KM_USER0); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same sideeffects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. + */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_nocache(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user); + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(!iov->iov_len && i->count)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + } +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. @@ -1795,6 +1909,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i } EXPORT_SYMBOL(generic_write_checks); +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + if (aops->write_begin) { + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); + } else { + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct page *page; +again: + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + return -ENOMEM; + + if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { + /* + * There is no way to resolve a short write situation + * for a !Uptodate page (except by double copying in + * the caller done by generic_perform_write_2copy). + * + * Instead, we have to bring it uptodate here. + */ + ret = aops->readpage(file, page); + page_cache_release(page); + if (ret) { + if (ret == AOP_TRUNCATED_PAGE) + goto again; + return ret; + } + goto again; + } + + ret = aops->prepare_write(file, page, offset, offset+len); + if (ret) { + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + return ret; + } +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + int ret; + + if (aops->write_end) { + mark_page_accessed(page); + ret = aops->write_end(file, mapping, pos, len, copied, + page, fsdata); + } else { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + + flush_dcache_page(page); + ret = aops->commit_write(file, page, offset, offset+len); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + if (ret < 0) { + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } else if (ret > 0) + ret = min_t(size_t, copied, ret); + else + ret = copied; + } + + return ret; +} +EXPORT_SYMBOL(pagecache_write_end); + ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, @@ -1834,151 +2033,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_direct_write); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +/* + * Find or create a page at the given pagecache position. Return the locked + * page. This function is specifically for buffered writes. + */ +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) { - struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - struct page *page; - struct page *cached_page = NULL; - size_t bytes; - struct pagevec lru_pvec; - const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_base = 0; /* offset in the current iovec */ - char __user *buf; - - pagevec_init(&lru_pvec, 0); + int status; + struct page *page; +repeat: + page = find_lock_page(mapping, index); + if (likely(page)) + return page; - /* - * handle partial DIO write. Adjust cur_iov if needed. - */ - if (likely(nr_segs == 1)) - buf = iov->iov_base + written; - else { - filemap_set_next_iovec(&cur_iov, &iov_base, written); - buf = cur_iov->iov_base + iov_base; + page = page_cache_alloc(mapping); + if (!page) + return NULL; + status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(status)) { + page_cache_release(page); + if (status == -EEXIST) + goto repeat; + return NULL; } + return page; +} +EXPORT_SYMBOL(__grab_cache_page); + +static ssize_t generic_perform_write_2copy(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + long status = 0; + ssize_t written = 0; do { - unsigned long index; - unsigned long offset; - size_t copied; + struct page *src_page; + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + offset = (pos & (PAGE_CACHE_SIZE - 1)); index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - - /* Limit the size of the copy to the caller's write size */ - bytes = min(bytes, count); + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); - /* We only need to worry about prefaulting when writes are from - * user-space. NFSd uses vfs_writev with several non-aligned - * segments in the vector, and limiting to one segment a time is - * a noticeable performance for re-write + /* + * a non-NULL src_page indicates that we're doing the + * copy via get_user_pages and kmap. */ - if (!segment_eq(get_fs(), KERNEL_DS)) { - /* - * Limit the size of the copy to that of the current - * segment, because fault_in_pages_readable() doesn't - * know how to walk segments. - */ - bytes = min(bytes, cur_iov->iov_len - iov_base); + src_page = NULL; - /* - * Bring in the user page that we will copy from - * _first_. Otherwise there's a nasty deadlock on - * copying from the same page as we're writing to, - * without it being marked up-to-date. - */ - fault_in_pages_readable(buf, bytes); + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; } - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + + page = __grab_cache_page(mapping, index); if (!page) { status = -ENOMEM; break; } - if (unlikely(bytes == 0)) { - status = 0; - copied = 0; - goto zero_length_segment; - } + /* + * non-uptodate pages cannot cope with short copies, and we + * cannot take a pagefault with the destination page locked. + * So pin the source page to copy it. + */ + if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { + unlock_page(page); - status = a_ops->prepare_write(file, page, offset, offset+bytes); - if (unlikely(status)) { - loff_t isize = i_size_read(inode); + src_page = alloc_page(GFP_KERNEL); + if (!src_page) { + page_cache_release(page); + status = -ENOMEM; + break; + } + + /* + * Cannot get_user_pages with a page locked for the + * same reason as we can't take a page fault with a + * page locked (as explained below). + */ + copied = iov_iter_copy_from_user(src_page, i, + offset, bytes); + if (unlikely(copied == 0)) { + status = -EFAULT; + page_cache_release(page); + page_cache_release(src_page); + break; + } + bytes = copied; - if (status != AOP_TRUNCATED_PAGE) + lock_page(page); + /* + * Can't handle the page going uptodate here, because + * that means we would use non-atomic usercopies, which + * zero out the tail of the page, which can cause + * zeroes to become transiently visible. We could just + * use a non-zeroing copy, but the APIs aren't too + * consistent. + */ + if (unlikely(!page->mapping || PageUptodate(page))) { unlock_page(page); - page_cache_release(page); - if (status == AOP_TRUNCATED_PAGE) + page_cache_release(page); + page_cache_release(src_page); continue; + } + } + + status = a_ops->prepare_write(file, page, offset, offset+bytes); + if (unlikely(status)) + goto fs_write_aop_error; + + if (!src_page) { /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. + * Must not enter the pagefault handler here, because + * we hold the page lock, so we might recursively + * deadlock on the same lock, or get an ABBA deadlock + * against a different lock, or against the mmap_sem + * (which nests outside the page lock). So increment + * preempt count, and use _atomic usercopies. + * + * The page is uptodate so we are OK to encounter a + * short copy: if unmodified parts of the page are + * marked dirty and written out to disk, it doesn't + * really matter. */ - if (pos + bytes > isize) - vmtruncate(inode, isize); - break; + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, + offset, bytes); + pagefault_enable(); + } else { + void *src, *dst; + src = kmap_atomic(src_page, KM_USER0); + dst = kmap_atomic(page, KM_USER1); + memcpy(dst + offset, src + offset, bytes); + kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); + copied = bytes; } - if (likely(nr_segs == 1)) - copied = filemap_copy_from_user(page, offset, - buf, bytes); - else - copied = filemap_copy_from_user_iovec(page, offset, - cur_iov, iov_base, bytes); flush_dcache_page(page); + status = a_ops->commit_write(file, page, offset, offset+bytes); - if (status == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } -zero_length_segment: - if (likely(copied >= 0)) { - if (!status) - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - if (unlikely(nr_segs > 1)) { - filemap_set_next_iovec(&cur_iov, - &iov_base, status); - if (count) - buf = cur_iov->iov_base + - iov_base; - } else { - iov_base += status; - } - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; + if (unlikely(status < 0)) + goto fs_write_aop_error; + if (unlikely(status > 0)) /* filesystem did partial write */ + copied = min_t(size_t, copied, status); + unlock_page(page); mark_page_accessed(page); page_cache_release(page); - if (status < 0) - break; + if (src_page) + page_cache_release(src_page); + + iov_iter_advance(i, copied); + pos += copied; + written += copied; + balance_dirty_pages_ratelimited(mapping); cond_resched(); - } while (count); - *ppos = pos; + continue; - if (cached_page) - page_cache_release(cached_page); +fs_write_aop_error: + unlock_page(page); + page_cache_release(page); + if (src_page) + page_cache_release(src_page); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + bytes > inode->i_size) + vmtruncate(inode, inode->i_size); + break; + } while (iov_iter_count(i)); + + return written ? written : status; +} + +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + * Copies from kernel address space cannot fail (NFSD is a big user). */ + if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) + break; + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + if (a_ops->write_begin) + status = generic_perform_write(file, &i, pos); + else + status = generic_perform_write_2copy(file, &i, pos); + if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, @@ -1994,7 +2356,6 @@ zero_length_segment: if (unlikely(file->f_flags & O_DIRECT) && written) status = filemap_write_and_wait(mapping); - pagevec_lru_add(&lru_pvec); return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); @@ -2193,21 +2554,17 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, } retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - if (retval) - goto out; /* * Finally, try again to invalidate clean pages which might have been - * faulted in by get_user_pages() if the source of the write was an - * mmap()ed region of the file we're writing. That's a pretty crazy - * thing to do, so we don't support it 100%. If this invalidation - * fails and we have -EIOCBQUEUED we ignore the failure. + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... */ if (rw == WRITE && mapping->nrpages) { - int err = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (err && retval >= 0) - retval = err; + invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); } out: return retval;