X-Git-Url: http://ftp.safe.ca/?p=safe%2Fjmp%2Flinux-2.6;a=blobdiff_plain;f=fs%2Fsplice.c;h=39208663aaf177f56b3e500fd33d8a56a5ecb500;hp=4ee49e86edde5a2272781b6e49e0371b721ceb33;hb=f653398c86a1c104f0992bd788dd4bb065449be4;hpb=8084870854fe181996c4aa4f44cb2fabcebf164c diff --git a/fs/splice.c b/fs/splice.c index 4ee49e8..3920866 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -58,8 +59,9 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ wait_on_page_writeback(page); - if (PagePrivate(page)) - try_to_release_page(page, GFP_KERNEL); + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag @@ -75,6 +77,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, * Raced with truncate or failed to remove page from current * address space, unlock and return failure. */ +out_unlock: unlock_page(page); return 1; } @@ -179,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, do_wakeup = 0; page_nr = 0; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); for (;;) { if (!pipe->readers) { @@ -242,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, pipe->waiting_writers--; } - if (pipe->inode) { - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (page_nr < spd_pages) @@ -320,7 +320,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); + mapping_gfp_mask(mapping)); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) @@ -370,19 +370,30 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * for an in-flight io page */ if (flags & SPLICE_F_NONBLOCK) { - if (TestSetPageLocked(page)) + if (!trylock_page(page)) { + error = -EAGAIN; break; + } } else lock_page(page); /* - * page was truncated, stop here. if this isn't the - * first page, we'll just complete what we already - * added + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. */ if (!page->mapping) { unlock_page(page); - break; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(pages[page_nr]); + pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -479,9 +490,8 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - ssize_t spliced; - int ret; loff_t isize, left; + int ret; isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) @@ -491,35 +501,139 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, if (unlikely(left < len)) len = left; - ret = 0; - spliced = 0; - while (len && !spliced) { - ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - - if (ret < 0) - break; - else if (!ret) { - if (spliced) - break; - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } - } - + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + if (ret > 0) { *ppos += ret; - len -= ret; - spliced += ret; + file_accessed(in); } - if (spliced) - return spliced; - return ret; } - EXPORT_SYMBOL(generic_file_splice_read); +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +} + +static ssize_t kernel_write(struct file *file, const char *buf, size_t count, + loff_t pos) +{ + mm_segment_t old_fs; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_write(file, (const char __user *)buf, count, &pos); + set_fs(old_fs); + + return res; +} + +ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct iovec vec[PIPE_BUFFERS]; + pgoff_t index; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + struct page *page; + + page = alloc_page(GFP_USER); + error = -ENOMEM; + if (!page) + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) page_address(page); + vec[i].iov_len = this_len; + pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } + + res = kernel_readv(in, vec, spd.nr_pages, *ppos); + if (res < 0) { + error = res; + goto err; + } + + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + this_len = min_t(size_t, vec[i].iov_len, res); + partial[i].offset = 0; + partial[i].len = this_len; + if (!this_len) { + __free_page(pages[i]); + pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; + + res = splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; + + return res; + +err: + for (i = 0; i < spd.nr_pages; i++) + __free_page(pages[i]); + + return error; +} +EXPORT_SYMBOL(default_file_splice_read); + /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. @@ -534,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, ret = buf->ops->confirm(pipe, buf); if (!ret) { more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - - ret = file->f_op->sendpage(file, buf->page, buf->offset, - sd->len, &pos, more); + if (file->f_op && file->f_op->sendpage) + ret = file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); + else + ret = -EINVAL; } return ret; @@ -562,8 +678,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create * a new page in the output file page cache and fill/dirty that. */ -static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) +int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { struct file *file = sd->u.file; struct address_space *mapping = file->f_mapping; @@ -607,108 +723,177 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, out: return ret; } +EXPORT_SYMBOL(pipe_to_file); + +static void wakeup_pipe_writers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +} /** - * __splice_from_pipe - splice data from a pipe to given actor + * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: - * This function does little more than loop over the pipe and call - * @actor to do the actual moving of a single struct pipe_buffer to - * the desired destination. See pipe_to_file, pipe_to_sendpage, or - * pipe_to_user. + * This function loops over the pipe and calls @actor to do the + * actual moving of a single struct pipe_buffer to the desired + * destination. It returns when there's no more buffers left in + * the pipe or if the requested number of bytes (@sd->total_len) + * have been copied. It returns a positive number (one) if the + * pipe needs to be filled with more data, zero if the required + * number of bytes have been copied and -errno on error. * + * This, together with splice_from_pipe_{begin,end,next}, may be + * used to implement the functionality of __splice_from_pipe() when + * locking is required around copying the pipe buffers to the + * destination. */ -ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, - splice_actor *actor) +int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) { - int ret, do_wakeup, err; - - ret = 0; - do_wakeup = 0; - - for (;;) { - if (pipe->nrbufs) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - const struct pipe_buf_operations *ops = buf->ops; + int ret; - sd->len = buf->len; - if (sd->len > sd->total_len) - sd->len = sd->total_len; + while (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + const struct pipe_buf_operations *ops = buf->ops; - err = actor(pipe, buf, sd); - if (err <= 0) { - if (!ret && err != -ENODATA) - ret = err; + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; - break; - } + ret = actor(pipe, buf, sd); + if (ret <= 0) { + if (ret == -ENODATA) + ret = 0; + return ret; + } + buf->offset += ret; + buf->len -= ret; - ret += err; - buf->offset += err; - buf->len -= err; + sd->num_spliced += ret; + sd->len -= ret; + sd->pos += ret; + sd->total_len -= ret; - sd->len -= err; - sd->pos += err; - sd->total_len -= err; - if (sd->len) - continue; + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + sd->need_wakeup = true; + } - if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->nrbufs--; - if (pipe->inode) - do_wakeup = 1; - } + if (!sd->total_len) + return 0; + } - if (!sd->total_len) - break; - } + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_feed); - if (pipe->nrbufs) - continue; +/** + * splice_from_pipe_next - wait for some data to splice from + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wait for some data and return a positive + * value (one) if pipe buffers are available. It will return zero + * or -errno if no more data needs to be spliced. + */ +int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + while (!pipe->nrbufs) { if (!pipe->writers) - break; - if (!pipe->waiting_writers) { - if (ret) - break; - } + return 0; - if (sd->flags & SPLICE_F_NONBLOCK) { - if (!ret) - ret = -EAGAIN; - break; - } + if (!pipe->waiting_writers && sd->num_spliced) + return 0; - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + if (sd->flags & SPLICE_F_NONBLOCK) + return -EAGAIN; - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - do_wakeup = 0; + if (signal_pending(current)) + return -ERESTARTSYS; + + if (sd->need_wakeup) { + wakeup_pipe_writers(pipe); + sd->need_wakeup = false; } pipe_wait(pipe); } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } + return 1; +} +EXPORT_SYMBOL(splice_from_pipe_next); - return ret; +/** + * splice_from_pipe_begin - start splicing from pipe + * @sd: information about the splice operation + * + * Description: + * This function should be called before a loop containing + * splice_from_pipe_next() and splice_from_pipe_feed() to + * initialize the necessary fields of @sd. + */ +void splice_from_pipe_begin(struct splice_desc *sd) +{ + sd->num_spliced = 0; + sd->need_wakeup = false; +} +EXPORT_SYMBOL(splice_from_pipe_begin); + +/** + * splice_from_pipe_end - finish splicing from pipe + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wake up pipe writers if necessary. It should + * be called after a loop containing splice_from_pipe_next() and + * splice_from_pipe_feed(). + */ +void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + if (sd->need_wakeup) + wakeup_pipe_writers(pipe); +} +EXPORT_SYMBOL(splice_from_pipe_end); + +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * + */ +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + splice_from_pipe_begin(sd); + do { + ret = splice_from_pipe_next(pipe, sd); + if (ret > 0) + ret = splice_from_pipe_feed(pipe, sd, actor); + } while (ret > 0); + splice_from_pipe_end(pipe, sd); + + return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); @@ -722,7 +907,7 @@ EXPORT_SYMBOL(__splice_from_pipe); * @actor: handler that splices the data * * Description: - * See __splice_from_pipe. This function locks the input and output inodes, + * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). * */ @@ -731,7 +916,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, splice_actor *actor) { ssize_t ret; - struct inode *inode = out->f_mapping->host; struct splice_desc sd = { .total_len = len, .flags = flags, @@ -739,21 +923,15 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; - /* - * The actor worker might be calling ->prepare_write and - * ->commit_write. Most of the time, these expect i_mutex to - * be held. Since this may result in an ABBA deadlock with - * pipe->inode, we have to order lock acquiry here. - */ - inode_double_lock(inode, pipe->inode); + pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); - inode_double_unlock(inode, pipe->inode); + pipe_unlock(pipe); return ret; } /** - * generic_file_splice_write_nolock - generic_file_splice_write without mutexes + * generic_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out @@ -762,13 +940,12 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * * Description: * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. The caller is responsible - * for acquiring i_mutex on both inodes. + * the given pipe inode to the given file. * */ ssize_t -generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; @@ -779,101 +956,78 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; ssize_t ret; - int err; - err = remove_suid(out->f_path.dentry); - if (unlikely(err)) - return err; + pipe_lock(pipe); + + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = file_remove_suid(out); + if (!ret) { + file_update_time(out); + ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); + + pipe_unlock(pipe); + + if (sd.num_spliced) + ret = sd.num_spliced; - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - - if (err) - ret = err; - } + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } return ret; } -EXPORT_SYMBOL(generic_file_splice_write_nolock); +EXPORT_SYMBOL(generic_file_splice_write); -/** - * generic_file_splice_write - splice data from a pipe to a file - * @pipe: pipe info - * @out: file to write to - * @ppos: position in @out - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. - * - */ -ssize_t -generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - int killsuid, killpriv; - ssize_t ret; - int err = 0; - - killpriv = security_inode_need_killpriv(out->f_path.dentry); - killsuid = should_remove_suid(out->f_path.dentry); - if (unlikely(killsuid || killpriv)) { - mutex_lock(&inode->i_mutex); - if (killpriv) - err = security_inode_killpriv(out->f_path.dentry); - if (!err && killsuid) - err = __remove_suid(out->f_path.dentry, killsuid); - mutex_unlock(&inode->i_mutex); - if (err) - return err; - } + int ret; + void *data; - ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); - if (ret > 0) { - unsigned long nr_pages; + ret = buf->ops->confirm(pipe, buf); + if (ret) + return ret; - *ppos += ret; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); - - if (err) - ret = err; - } - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); - } + data = buf->ops->map(pipe, buf, 0); + ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + buf->ops->unmap(pipe, buf, data); return ret; } -EXPORT_SYMBOL(generic_file_splice_write); +static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret; + + ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); + if (ret > 0) + *ppos += ret; + + return ret; +} /** * generic_splice_sendpage - splice data from a pipe to a socket @@ -902,19 +1056,26 @@ EXPORT_SYMBOL(generic_splice_sendpage); static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); int ret; - if (unlikely(!out->f_op || !out->f_op->splice_write)) - return -EINVAL; - if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, ppos, len, flags); + if (out->f_op && out->f_op->splice_write) + splice_write = out->f_op->splice_write; + else + splice_write = default_file_splice_write; + + return splice_write(pipe, out, ppos, len, flags); } /* @@ -924,11 +1085,10 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); int ret; - if (unlikely(!in->f_op || !in->f_op->splice_read)) - return -EINVAL; - if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -936,7 +1096,12 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - return in->f_op->splice_read(in, ppos, pipe, len, flags); + if (in->f_op && in->f_op->splice_read) + splice_read = in->f_op->splice_read; + else + splice_read = default_file_splice_read; + + return splice_read(in, ppos, pipe, len, flags); } /** @@ -1005,7 +1170,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, while (len) { size_t read_len; - loff_t pos = sd->pos; + loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) @@ -1020,15 +1185,19 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); - if (unlikely(ret <= 0)) + if (unlikely(ret <= 0)) { + sd->pos = prev_pos; goto out_release; + } bytes += ret; len -= ret; sd->pos = pos; - if (ret < read_len) + if (ret < read_len) { + sd->pos = prev_pos + ret; goto out_release; + } } done: @@ -1094,11 +1263,14 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) - *ppos += ret; + *ppos = sd.pos; return ret; } +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); /* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same * location, so checking ->i_pipe is not enough to verify that this is a @@ -1119,16 +1291,37 @@ static long do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { - struct pipe_inode_info *pipe; + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; loff_t offset, *off; long ret; - pipe = pipe_info(in->f_path.dentry->d_inode); - if (pipe) { + ipipe = pipe_info(in->f_path.dentry->d_inode); + opipe = pipe_info(out->f_path.dentry->d_inode); + + if (ipipe && opipe) { + if (off_in || off_out) + return -ESPIPE; + + if (!(in->f_mode & FMODE_READ)) + return -EBADF; + + if (!(out->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Splicing to self would be fun, but... */ + if (ipipe == opipe) + return -EINVAL; + + return splice_pipe_to_pipe(ipipe, opipe, len, flags); + } + + if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { - if (out->f_op->llseek == no_llseek) + if (!out->f_op || !out->f_op->llseek || + out->f_op->llseek == no_llseek) return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; @@ -1136,7 +1329,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - ret = do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(ipipe, out, off, len, flags); if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) ret = -EFAULT; @@ -1144,12 +1337,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, return ret; } - pipe = pipe_info(out->f_path.dentry->d_inode); - if (pipe) { + if (opipe) { if (off_out) return -ESPIPE; if (off_in) { - if (in->f_op->llseek == no_llseek) + if (!in->f_op || !in->f_op->llseek || + in->f_op->llseek == no_llseek) return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; @@ -1157,7 +1350,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - ret = do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, opipe, len, flags); if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) ret = -EFAULT; @@ -1169,33 +1362,6 @@ static long do_splice(struct file *in, loff_t __user *off_in, } /* - * Do a copy-from-user while holding the mmap_semaphore for reading, in a - * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem - * for writing) and page faulting on the user memory pointed to by src. - * This assumes that we will very rarely hit the partial != 0 path, or this - * will not be a win. - */ -static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n) -{ - int partial; - - pagefault_disable(); - partial = __copy_from_user_inatomic(dst, src, n); - pagefault_enable(); - - /* - * Didn't copy everything, drop the mmap_sem and do a faulting copy - */ - if (unlikely(partial)) { - up_read(¤t->mm->mmap_sem); - partial = copy_from_user(dst, src, n); - down_read(¤t->mm->mmap_sem); - } - - return partial; -} - -/* * Map an iov into an array of pages and offset/length tupples. With the * partial_page structure, we can map several non-contiguous ranges into * our ones pages[] map instead of splitting that operation into pieces. @@ -1208,8 +1374,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, { int buffers = 0, error = 0; - down_read(¤t->mm->mmap_sem); - while (nr_vecs) { unsigned long off, npages; struct iovec entry; @@ -1218,7 +1382,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, int i; error = -EFAULT; - if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry))) + if (copy_from_user(&entry, iov, sizeof(entry))) break; base = entry.iov_base; @@ -1231,7 +1395,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, if (unlikely(!len)) break; error = -EFAULT; - if (unlikely(!base)) + if (!access_ok(VERIFY_READ, base, len)) break; /* @@ -1252,9 +1416,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, if (npages > PIPE_BUFFERS - buffers) npages = PIPE_BUFFERS - buffers; - error = get_user_pages(current, current->mm, - (unsigned long) base, npages, 0, 0, - &pages[buffers], NULL); + error = get_user_pages_fast((unsigned long)base, npages, + 0, &pages[buffers]); if (unlikely(error <= 0)) break; @@ -1293,8 +1456,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, iov++; } - up_read(¤t->mm->mmap_sem); - if (buffers) return buffers; @@ -1359,8 +1520,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, if (!pipe) return -EBADF; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); error = ret = 0; while (nr_segs) { @@ -1387,6 +1547,11 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, break; } + if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { + error = -EFAULT; + break; + } + sd.len = 0; sd.total_len = len; sd.flags = flags; @@ -1410,8 +1575,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, iov++; } - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (!ret) ret = error; @@ -1466,8 +1630,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ -asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, - unsigned long nr_segs, unsigned int flags) +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, + unsigned long, nr_segs, unsigned int, flags) { struct file *file; long error; @@ -1492,9 +1656,9 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, return error; } -asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, - int fd_out, loff_t __user *off_out, - size_t len, unsigned int flags) +SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + size_t, len, unsigned int, flags) { long error; struct file *in, *out; @@ -1527,7 +1691,7 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. */ -static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; @@ -1539,7 +1703,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (!pipe->nrbufs) { if (signal_pending(current)) { @@ -1557,7 +1721,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe_wait(pipe); } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1565,7 +1729,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ -static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; @@ -1577,7 +1741,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (pipe->nrbufs >= PIPE_BUFFERS) { if (!pipe->readers) { @@ -1598,7 +1762,125 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe->waiting_writers--; } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); + return ret; +} + +/* + * Splice contents of ipipe to opipe. + */ +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, nbuf; + bool input_wakeup = false; + + +retry: + ret = ipipe_prep(ipipe, flags); + if (ret) + return ret; + + ret = opipe_prep(opipe, flags); + if (ret) + return ret; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (!ipipe->nrbufs && !ipipe->writers) + break; + + /* + * Cannot make any progress, because either the input + * pipe is empty or the output pipe is full. + */ + if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + /* Already processed some buffers, break */ + if (ret) + break; + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + /* + * We raced with another reader/writer and haven't + * managed to process any buffers. A zero return + * value means EOF, so retry instead. + */ + pipe_unlock(ipipe); + pipe_unlock(opipe); + goto retry; + } + + ibuf = ipipe->bufs + ipipe->curbuf; + nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + obuf = opipe->bufs + nbuf; + + if (len >= ibuf->len) { + /* + * Simply move the whole buffer from ipipe to opipe + */ + *obuf = *ibuf; + ibuf->ops = NULL; + opipe->nrbufs++; + ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->nrbufs--; + input_wakeup = true; + } else { + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + obuf->len = len; + opipe->nrbufs++; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + ret += obuf->len; + len -= obuf->len; + } while (len); + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + if (input_wakeup) + wakeup_pipe_writers(ipipe); + return ret; } @@ -1614,10 +1896,10 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * Potential ABBA deadlock, work around it by ordering lock - * grabbing by inode address. Otherwise two different processes + * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - inode_double_lock(ipipe->inode, opipe->inode); + pipe_double_lock(ipipe, opipe); do { if (!opipe->readers) { @@ -1661,7 +1943,15 @@ static int link_pipe(struct pipe_inode_info *ipipe, i++; } while (len); - inode_double_unlock(ipipe->inode, opipe->inode); + /* + * return EAGAIN if we have the potential of some data in the + * future, otherwise just return 0 + */ + if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + + pipe_unlock(ipipe); + pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. @@ -1698,21 +1988,18 @@ static long do_tee(struct file *in, struct file *out, size_t len, * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. */ - ret = link_ipipe_prep(ipipe, flags); + ret = ipipe_prep(ipipe, flags); if (!ret) { - ret = link_opipe_prep(opipe, flags); - if (!ret) { + ret = opipe_prep(opipe, flags); + if (!ret) ret = link_pipe(ipipe, opipe, len, flags); - if (!ret && (flags & SPLICE_F_NONBLOCK)) - ret = -EAGAIN; - } } } return ret; } -asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { struct file *in; int error, fput_in;