X-Git-Url: http://ftp.safe.ca/?p=safe%2Fjmp%2Flinux-2.6;a=blobdiff_plain;f=fs%2Fsplice.c;h=6bdcb6107bc3c55b58199537e2eb5537133cd460;hp=5428b0ff3b6fcfebe6abd074cb9719ec48c9dbda;hb=b53767719b6cd8789392ea3e7e2eb7b8906898f0;hpb=d9993c37ef87c758d4a6e63972395b1cf8a4cb7b diff --git a/fs/splice.c b/fs/splice.c index 5428b0f..6bdcb61 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -28,22 +28,7 @@ #include #include #include - -struct partial_page { - unsigned int offset; - unsigned int len; -}; - -/* - * Passed to splice_to_pipe - */ -struct splice_pipe_desc { - struct page **pages; /* page map */ - struct partial_page *partial; /* pages[] may not be contig */ - int nr_pages; /* number of pages in map */ - unsigned int flags; /* splice flags */ - const struct pipe_buf_operations *ops;/* ops associated with output pipe */ -}; +#include /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -101,8 +86,12 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +/* + * Check whether the contents of buf is OK to access. Since the content + * is a page cache page, IO may be in flight. + */ +static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; int err; @@ -143,7 +132,7 @@ static const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = page_cache_pipe_buf_pin, + .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -163,19 +152,27 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = generic_pipe_buf_pin, + .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, .get = generic_pipe_buf_get, }; -/* - * Pipe output worker. This sets up our pipe format with the page cache - * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). +/** + * splice_to_pipe - fill passed data into a pipe + * @pipe: pipe to fill + * @spd: data to fill + * + * Description: + * @spd contains a map of pages and len/offset tuples, along with + * the struct pipe_buf_operations associated with these pages. This + * function will link that data to the pipe. + * */ -static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { + unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; ret = 0; @@ -200,6 +197,7 @@ static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; if (spd->flags & SPLICE_F_GIFT) buf->flags |= PIPE_BUF_FLAG_GIFT; @@ -244,17 +242,18 @@ static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, pipe->waiting_writers--; } - if (pipe->inode) + if (pipe->inode) { mutex_unlock(&pipe->inode->i_mutex); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } } - while (page_nr < spd->nr_pages) + while (page_nr < spd_pages) page_cache_release(spd->pages[page_nr++]); return ret; @@ -266,13 +265,12 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int loff, nr_pages; + unsigned int loff, nr_pages, req_pages; struct page *pages[PIPE_BUFFERS]; struct partial_page partial[PIPE_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; - size_t total_len; int error, page_nr; struct splice_pipe_desc spd = { .pages = pages, @@ -283,35 +281,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; - nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - if (nr_pages > PIPE_BUFFERS) - nr_pages = PIPE_BUFFERS; - - /* - * Initiate read-ahead on this page range. however, don't call into - * read-ahead if this is a non-zero offset (we are likely doing small - * chunk splice and the page is already there) for a single page. - */ - if (!loff || nr_pages > 1) - page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); - - /* - * Now fill in the holes: - */ - error = 0; - total_len = 0; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); /* * Lookup the (hopefully) full range of pages we need. */ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + index += spd.nr_pages; /* * If find_get_pages_contig() returned fewer pages than we needed, - * allocate the rest. + * readahead/allocate the rest and fill in the holes. */ - index += spd.nr_pages; + if (spd.nr_pages < nr_pages) + page_cache_sync_readahead(mapping, &in->f_ra, in, + index, req_pages - spd.nr_pages); + + error = 0; while (spd.nr_pages < nr_pages) { /* * Page could be there, find_get_pages_contig() breaks on @@ -320,12 +307,6 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, page = find_get_page(mapping, index); if (!page) { /* - * Make sure the read-ahead engine is notified - * about this failure. - */ - handle_ra_miss(mapping, &in->f_ra, index); - - /* * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); @@ -370,6 +351,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); page = pages[page_nr]; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, &in->f_ra, in, + page, index, req_pages - page_nr); + /* * If the page isn't uptodate, we may need to start io on it */ @@ -378,10 +363,11 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * If in nonblock mode then dont block on waiting * for an in-flight io page */ - if (flags & SPLICE_F_NONBLOCK) - break; - - lock_page(page); + if (flags & SPLICE_F_NONBLOCK) { + if (TestSetPageLocked(page)) + break; + } else + lock_page(page); /* * page was truncated, stop here. if this isn't the @@ -416,47 +402,52 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; } + } +fill_it: + /* + * i_size must be checked after PageUptodate. + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + unsigned int plen; /* - * i_size must be checked after ->readpage(). + * max good bytes in this page */ - isize = i_size_read(mapping->host); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) break; /* - * if this is the last page, see if we need to shrink - * the length and stop + * force quit after adding this page */ - if (end_index == index) { - loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); - if (total_len + loff > isize) - break; - /* - * force quit after adding this page - */ - len = this_len; - this_len = min(this_len, loff); - loff = 0; - } + this_len = min(this_len, plen - loff); + len = this_len; } -fill_it: + partial[page_nr].offset = loff; partial[page_nr].len = this_len; len -= this_len; - total_len += this_len; loff = 0; spd.nr_pages++; index++; } /* - * Release any pages at the end, if we quit early. 'i' is how far + * Release any pages at the end, if we quit early. 'page_nr' is how far * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) page_cache_release(pages[page_nr++]); + in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) return splice_to_pipe(pipe, &spd); @@ -467,11 +458,16 @@ fill_it: /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from + * @ppos: position in @in * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * - * Will read pages from given file and fill them into a pipe. + * Description: + * Will read pages from given file and fill them into a pipe. Can be + * used as long as the address_space operations for the source implements + * a readpage() hook. + * */ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -479,11 +475,19 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, { ssize_t spliced; int ret; + loff_t isize, left; + + isize = i_size_read(in->f_mapping->host); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; ret = 0; spliced = 0; - - while (len) { + while (len && !spliced) { ret = __generic_file_splice_read(in, ppos, pipe, len, flags); if (ret < 0) @@ -517,11 +521,11 @@ EXPORT_SYMBOL(generic_file_splice_read); static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - struct file *file = sd->file; + struct file *file = sd->u.file; loff_t pos = sd->pos; int ret, more; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (!ret) { more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; @@ -555,63 +559,30 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - struct file *file = sd->file; + struct file *file = sd->u.file; struct address_space *mapping = file->f_mapping; unsigned int offset, this_len; struct page *page; - pgoff_t index; + void *fsdata; int ret; /* * make sure the data in this buffer is uptodate */ - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; - index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; this_len = sd->len; if (this_len + offset > PAGE_CACHE_SIZE) this_len = PAGE_CACHE_SIZE - offset; -find_page: - page = find_lock_page(mapping, index); - if (!page) { - ret = -ENOMEM; - page = page_cache_alloc_cold(mapping); - if (unlikely(!page)) - goto out_ret; - - /* - * This will also lock the page - */ - ret = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); - if (unlikely(ret)) - goto out; - } - - ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); - if (unlikely(ret)) { - loff_t isize = i_size_read(mapping->host); - - if (ret != AOP_TRUNCATED_PAGE) - unlock_page(page); - page_cache_release(page); - if (ret == AOP_TRUNCATED_PAGE) - goto find_page; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. - */ - if (sd->pos + this_len > isize) - vmtruncate(mapping->host, isize); - - goto out_ret; - } + ret = pagecache_write_begin(file, mapping, sd->pos, this_len, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (unlikely(ret)) + goto out; if (buf->page != page) { /* @@ -625,64 +596,43 @@ find_page: kunmap_atomic(dst, KM_USER1); buf->ops->unmap(pipe, buf, src); } - - ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); - if (ret) { - if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto find_page; - } - if (ret < 0) - goto out; - /* - * Partial write has happened, so 'ret' already initialized by - * number of bytes written, Where is nothing we have to do here. - */ - } else - ret = this_len; - /* - * Return the number of bytes written and mark page as - * accessed, we are now done! - */ - mark_page_accessed(page); - balance_dirty_pages_ratelimited(mapping); + ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, + page, fsdata); out: - page_cache_release(page); - unlock_page(page); -out_ret: return ret; } -/* - * Pipe input worker. Most of this logic works like a regular pipe, the - * key here is the 'actor' worker passed in that actually moves the data - * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * */ -ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, - struct file *out, loff_t *ppos, size_t len, - unsigned int flags, splice_actor *actor) +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) { int ret, do_wakeup, err; - struct splice_desc sd; ret = 0; do_wakeup = 0; - sd.total_len = len; - sd.flags = flags; - sd.file = out; - sd.pos = *ppos; - for (;;) { if (pipe->nrbufs) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; const struct pipe_buf_operations *ops = buf->ops; - sd.len = buf->len; - if (sd.len > sd.total_len) - sd.len = sd.total_len; + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; - err = actor(pipe, buf, &sd); + err = actor(pipe, buf, sd); if (err <= 0) { if (!ret && err != -ENODATA) ret = err; @@ -694,10 +644,10 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, buf->offset += err; buf->len -= err; - sd.len -= err; - sd.pos += err; - sd.total_len -= err; - if (sd.len) + sd->len -= err; + sd->pos += err; + sd->total_len -= err; + if (sd->len) continue; if (!buf->len) { @@ -709,7 +659,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, do_wakeup = 1; } - if (!sd.total_len) + if (!sd->total_len) break; } @@ -722,7 +672,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, break; } - if (flags & SPLICE_F_NONBLOCK) { + if (sd->flags & SPLICE_F_NONBLOCK) { if (!ret) ret = -EAGAIN; break; @@ -756,12 +706,32 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, } EXPORT_SYMBOL(__splice_from_pipe); +/** + * splice_from_pipe - splice data from a pipe to a file + * @pipe: pipe to splice from + * @out: file to splice to + * @ppos: position in @out + * @len: how many bytes to splice + * @flags: splice modifier flags + * @actor: handler that splices the data + * + * Description: + * See __splice_from_pipe. This function locks the input and output inodes, + * otherwise it's identical to __splice_from_pipe(). + * + */ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { ssize_t ret; struct inode *inode = out->f_mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; /* * The actor worker might be calling ->prepare_write and @@ -770,7 +740,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * pipe->inode, we have to order lock acquiry here. */ inode_double_lock(inode, pipe->inode); - ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor); + ret = __splice_from_pipe(pipe, &sd, actor); inode_double_unlock(inode, pipe->inode); return ret; @@ -780,12 +750,14 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * generic_file_splice_write_nolock - generic_file_splice_write without mutexes * @pipe: pipe info * @out: file to write to + * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. The caller is responsible - * for acquiring i_mutex on both inodes. + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. The caller is responsible + * for acquiring i_mutex on both inodes. * */ ssize_t @@ -794,6 +766,12 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, { struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; ssize_t ret; int err; @@ -801,9 +779,12 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, if (unlikely(err)) return err; - ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + ret = __splice_from_pipe(pipe, &sd, pipe_to_file); if (ret > 0) { + unsigned long nr_pages; + *ppos += ret; + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* * If file or inode is SYNC and we actually wrote some data, @@ -816,6 +797,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, if (err) ret = err; } + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } return ret; @@ -827,11 +809,13 @@ EXPORT_SYMBOL(generic_file_splice_write_nolock); * generic_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to + * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. * */ ssize_t @@ -840,13 +824,18 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, { struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; + int killsuid, killpriv; ssize_t ret; - int err; + int err = 0; - err = should_remove_suid(out->f_path.dentry); - if (unlikely(err)) { + killpriv = security_inode_need_killpriv(out->f_path.dentry); + killsuid = should_remove_suid(out->f_path.dentry); + if (unlikely(killsuid || killpriv)) { mutex_lock(&inode->i_mutex); - err = __remove_suid(out->f_path.dentry, err); + if (killpriv) + err = security_inode_killpriv(out->f_path.dentry); + if (!err && killsuid) + err = __remove_suid(out->f_path.dentry, killsuid); mutex_unlock(&inode->i_mutex); if (err) return err; @@ -854,7 +843,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); if (ret > 0) { + unsigned long nr_pages; + *ppos += ret; + nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* * If file or inode is SYNC and we actually wrote some data, @@ -869,6 +861,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (err) ret = err; } + balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } return ret; @@ -878,13 +871,15 @@ EXPORT_SYMBOL(generic_file_splice_write); /** * generic_splice_sendpage - splice data from a pipe to a socket - * @inode: pipe inode + * @pipe: pipe to splice from * @out: socket to write to + * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * - * Will send @len bytes from the pipe to a network socket. No data copying - * is involved. + * Description: + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, @@ -913,6 +908,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(ret < 0)) return ret; + ret = security_file_permission(out, MAY_WRITE); + if (unlikely(ret < 0)) + return ret; + return out->f_op->splice_write(pipe, out, ppos, len, flags); } @@ -923,7 +922,6 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - loff_t isize, left; int ret; if (unlikely(!in->f_op || !in->f_op->splice_read)) @@ -936,25 +934,34 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - isize = i_size_read(in->f_mapping->host); - if (unlikely(*ppos >= isize)) - return 0; - - left = isize - *ppos; - if (unlikely(left < len)) - len = left; + ret = security_file_permission(in, MAY_READ); + if (unlikely(ret < 0)) + return ret; return in->f_op->splice_read(in, ppos, pipe, len, flags); } -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags) +/** + * splice_direct_to_actor - splices data directly between two non-pipes + * @in: file to splice from + * @sd: actor information on where to splice to + * @actor: handles the data splicing + * + * Description: + * This is a special case helper to splice directly between two + * points, without requiring an explicit pipe. Internally an allocated + * pipe is cached in the process, and reused during the lifetime of + * that process. + * + */ +ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, + splice_direct_actor *actor) { struct pipe_inode_info *pipe; long ret, bytes; - loff_t out_off; umode_t i_mode; - int i; + size_t len; + int i, flags; /* * We require the input being a regular file, as we don't want to @@ -990,49 +997,43 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, */ ret = 0; bytes = 0; - out_off = 0; + len = sd->total_len; + flags = sd->flags; - while (len) { - size_t read_len, max_read_len; + /* + * Don't block on output, we have to drain the direct pipe. + */ + sd->flags &= ~SPLICE_F_NONBLOCK; - /* - * Do at most PIPE_BUFFERS pages worth of transfer: - */ - max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + while (len) { + size_t read_len; + loff_t pos = sd->pos; - ret = do_splice_to(in, ppos, pipe, max_read_len, flags); - if (unlikely(ret < 0)) + ret = do_splice_to(in, &pos, pipe, len, flags); + if (unlikely(ret <= 0)) goto out_release; read_len = ret; + sd->total_len = read_len; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ - ret = do_splice_from(pipe, out, &out_off, read_len, - flags & ~SPLICE_F_NONBLOCK); - if (unlikely(ret < 0)) + ret = actor(pipe, sd); + if (unlikely(ret <= 0)) goto out_release; bytes += ret; len -= ret; + sd->pos = pos; - /* - * In nonblocking mode, if we got back a short read then - * that was due to either an IO error or due to the - * pagecache entry not being there. In the IO error case - * the _next_ splice attempt will produce a clean IO error - * return value (not a short read), so in both cases it's - * correct to break out of the loop here: - */ - if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) - break; + if (ret < read_len) + goto out_release; } pipe->nrbufs = pipe->curbuf = 0; - return bytes; out_release: @@ -1057,9 +1058,51 @@ out_release: return bytes; return ret; + } +EXPORT_SYMBOL(splice_direct_to_actor); -EXPORT_SYMBOL(do_splice_direct); +static int direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + + return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); +} + +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + */ +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) +{ + struct splice_desc sd = { + .len = len, + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + long ret; + + ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + if (ret > 0) + *ppos += ret; + + return ret; +} /* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same @@ -1131,6 +1174,33 @@ static long do_splice(struct file *in, loff_t __user *off_in, } /* + * Do a copy-from-user while holding the mmap_semaphore for reading, in a + * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem + * for writing) and page faulting on the user memory pointed to by src. + * This assumes that we will very rarely hit the partial != 0 path, or this + * will not be a win. + */ +static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n) +{ + int partial; + + pagefault_disable(); + partial = __copy_from_user_inatomic(dst, src, n); + pagefault_enable(); + + /* + * Didn't copy everything, drop the mmap_sem and do a faulting copy + */ + if (unlikely(partial)) { + up_read(¤t->mm->mmap_sem); + partial = copy_from_user(dst, src, n); + down_read(¤t->mm->mmap_sem); + } + + return partial; +} + +/* * Map an iov into an array of pages and offset/length tupples. With the * partial_page structure, we can map several non-contiguous ranges into * our ones pages[] map instead of splitting that operation into pieces. @@ -1143,31 +1213,26 @@ static int get_iovec_page_array(const struct iovec __user *iov, { int buffers = 0, error = 0; - /* - * It's ok to take the mmap_sem for reading, even - * across a "get_user()". - */ down_read(¤t->mm->mmap_sem); while (nr_vecs) { unsigned long off, npages; + struct iovec entry; void __user *base; size_t len; int i; - /* - * Get user address base and length for this iovec. - */ - error = get_user(base, &iov->iov_base); - if (unlikely(error)) - break; - error = get_user(len, &iov->iov_len); - if (unlikely(error)) + error = -EFAULT; + if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry))) break; + base = entry.iov_base; + len = entry.iov_len; + /* * Sanity check this iovec. 0 read succeeds. */ + error = 0; if (unlikely(!len)) break; error = -EFAULT; @@ -1241,28 +1306,131 @@ static int get_iovec_page_array(const struct iovec __user *iov, return error; } +static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + char *src; + int ret; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) + return ret; + + /* + * See if we can use the atomic maps, by prefaulting in the + * pages and doing an atomic copy + */ + if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { + src = buf->ops->map(pipe, buf, 1); + ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, + sd->len); + buf->ops->unmap(pipe, buf, src); + if (!ret) { + ret = sd->len; + goto out; + } + } + + /* + * No dice, use slow non-atomic map and copy + */ + src = buf->ops->map(pipe, buf, 0); + + ret = sd->len; + if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) + ret = -EFAULT; + + buf->ops->unmap(pipe, buf, src); +out: + if (ret > 0) + sd->u.userptr += ret; + return ret; +} + +/* + * For lack of a better implementation, implement vmsplice() to userspace + * as a simple copy of the pipes pages to the user iov. + */ +static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct splice_desc sd; + ssize_t size; + int error; + long ret; + + pipe = pipe_info(file->f_path.dentry->d_inode); + if (!pipe) + return -EBADF; + + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); + + error = ret = 0; + while (nr_segs) { + void __user *base; + size_t len; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + if (unlikely(!base)) { + error = -EFAULT; + break; + } + + sd.len = 0; + sd.total_len = len; + sd.flags = flags; + sd.u.userptr = base; + sd.pos = 0; + + size = __splice_from_pipe(pipe, &sd, pipe_to_user); + if (size < 0) { + if (!ret) + ret = size; + + break; + } + + ret += size; + + if (size < len) + break; + + nr_segs--; + iov++; + } + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (!ret) + ret = error; + + return ret; +} + /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. - * - * Note that vmsplice only supports splicing _from_ user memory to a pipe, - * not the other way around. Splicing from user memory is a simple operation - * that can be supported without any funky alignment restrictions or nasty - * vm tricks. We simply map in the user memory and fill them into a pipe. - * The reverse isn't quite as easy, though. There are two possible solutions - * for that: - * - * - memcpy() the data internally, at which point we might as well just - * do a regular read() on the buffer anyway. - * - Lots of nasty vm tricks, that are neither fast nor flexible (it - * has restriction limitations on both ends of the pipe). - * - * Alas, it isn't here. - * */ -static long do_vmsplice(struct file *file, const struct iovec __user *iov, - unsigned long nr_segs, unsigned int flags) +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct page *pages[PIPE_BUFFERS]; @@ -1277,10 +1445,6 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - if (unlikely(nr_segs > UIO_MAXIOV)) - return -EINVAL; - else if (unlikely(!nr_segs)) - return 0; spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, flags & SPLICE_F_GIFT); @@ -1290,6 +1454,22 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, return splice_to_pipe(pipe, &spd); } +/* + * Note that vmsplice only really supports true splicing _from_ user memory + * to a pipe, not the other way around. Splicing from user memory is a simple + * operation that can be supported without any funky alignment restrictions + * or nasty vm tricks. We simply map in the user memory and fill them into + * a pipe. The reverse isn't quite as easy, though. There are two possible + * solutions for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Currently we punt and implement it as a normal copy, see pipe_to_user(). + * + */ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { @@ -1297,11 +1477,18 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, long error; int fput; + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + error = -EBADF; file = fget_light(fd, &fput); if (file) { if (file->f_mode & FMODE_WRITE) - error = do_vmsplice(file, iov, nr_segs, flags); + error = vmsplice_to_pipe(file, iov, nr_segs, flags); + else if (file->f_mode & FMODE_READ) + error = vmsplice_to_user(file, iov, nr_segs, flags); fput_light(file, fput); }