#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
+#include <linux/security.h>
/*
* Attempt to steal a page from a pipe buffer. This should perhaps go into
buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
-static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+/*
+ * Check whether the contents of buf is OK to access. Since the content
+ * is a page cache page, IO may be in flight.
+ */
+static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
struct page *page = buf->page;
int err;
.can_merge = 0,
.map = generic_pipe_buf_map,
.unmap = generic_pipe_buf_unmap,
- .pin = page_cache_pipe_buf_pin,
+ .confirm = page_cache_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
.get = generic_pipe_buf_get,
.can_merge = 0,
.map = generic_pipe_buf_map,
.unmap = generic_pipe_buf_unmap,
- .pin = generic_pipe_buf_pin,
+ .confirm = generic_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = user_page_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
-/*
- * Pipe output worker. This fills a pipe with the information contained
- * from splice_pipe_desc().
+/**
+ * splice_to_pipe - fill passed data into a pipe
+ * @pipe: pipe to fill
+ * @spd: data to fill
+ *
+ * Description:
+ * @spd contains a map of pages and len/offset tuples, along with
+ * the struct pipe_buf_operations associated with these pages. This
+ * function will link that data to the pipe.
+ *
*/
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd)
buf->page = spd->pages[page_nr];
buf->offset = spd->partial[page_nr].offset;
buf->len = spd->partial[page_nr].len;
+ buf->private = spd->partial[page_nr].private;
buf->ops = spd->ops;
if (spd->flags & SPLICE_F_GIFT)
buf->flags |= PIPE_BUF_FLAG_GIFT;
}
while (page_nr < spd_pages)
- page_cache_release(spd->pages[page_nr++]);
+ spd->spd_release(spd, page_nr++);
return ret;
}
+static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
+{
+ page_cache_release(spd->pages[i]);
+}
+
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
struct address_space *mapping = in->f_mapping;
- unsigned int loff, nr_pages;
+ unsigned int loff, nr_pages, req_pages;
struct page *pages[PIPE_BUFFERS];
struct partial_page partial[PIPE_BUFFERS];
struct page *page;
.partial = partial,
.flags = flags,
.ops = &page_cache_pipe_buf_ops,
+ .spd_release = spd_release_page,
};
index = *ppos >> PAGE_CACHE_SHIFT;
loff = *ppos & ~PAGE_CACHE_MASK;
- nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
- if (nr_pages > PIPE_BUFFERS)
- nr_pages = PIPE_BUFFERS;
-
- /*
- * Don't try to 2nd guess the read-ahead logic, call into
- * page_cache_readahead() like the page cache reads would do.
- */
- page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
-
- /*
- * Now fill in the holes:
- */
- error = 0;
+ req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
/*
* Lookup the (hopefully) full range of pages we need.
*/
spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+ index += spd.nr_pages;
/*
* If find_get_pages_contig() returned fewer pages than we needed,
- * allocate the rest.
+ * readahead/allocate the rest and fill in the holes.
*/
- index += spd.nr_pages;
+ if (spd.nr_pages < nr_pages)
+ page_cache_sync_readahead(mapping, &in->f_ra, in,
+ index, req_pages - spd.nr_pages);
+
+ error = 0;
while (spd.nr_pages < nr_pages) {
/*
* Page could be there, find_get_pages_contig() breaks on
page = find_get_page(mapping, index);
if (!page) {
/*
- * Make sure the read-ahead engine is notified
- * about this failure.
- */
- handle_ra_miss(mapping, &in->f_ra, index);
-
- /*
* page didn't exist, allocate one.
*/
page = page_cache_alloc_cold(mapping);
break;
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL);
+ mapping_gfp_mask(mapping));
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
page = pages[page_nr];
+ if (PageReadahead(page))
+ page_cache_async_readahead(mapping, &in->f_ra, in,
+ page, index, req_pages - page_nr);
+
/*
* If the page isn't uptodate, we may need to start io on it
*/
* for an in-flight io page
*/
if (flags & SPLICE_F_NONBLOCK) {
- if (TestSetPageLocked(page))
+ if (TestSetPageLocked(page)) {
+ error = -EAGAIN;
break;
+ }
} else
lock_page(page);
*/
while (page_nr < nr_pages)
page_cache_release(pages[page_nr++]);
+ in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
if (spd.nr_pages)
return splice_to_pipe(pipe, &spd);
/**
* generic_file_splice_read - splice data from file to a pipe
* @in: file to splice from
+ * @ppos: position in @in
* @pipe: pipe to splice to
* @len: number of bytes to splice
* @flags: splice modifier flags
*
- * Will read pages from given file and fill them into a pipe.
+ * Description:
+ * Will read pages from given file and fill them into a pipe. Can be
+ * used as long as the address_space operations for the source implements
+ * a readpage() hook.
+ *
*/
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
- ssize_t spliced;
- int ret;
loff_t isize, left;
+ int ret;
isize = i_size_read(in->f_mapping->host);
if (unlikely(*ppos >= isize))
if (unlikely(left < len))
len = left;
- ret = 0;
- spliced = 0;
- while (len) {
- ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
-
- if (ret < 0)
- break;
- else if (!ret) {
- if (spliced)
- break;
- if (flags & SPLICE_F_NONBLOCK) {
- ret = -EAGAIN;
- break;
- }
- }
-
+ ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
+ if (ret > 0)
*ppos += ret;
- len -= ret;
- spliced += ret;
- }
-
- if (spliced)
- return spliced;
return ret;
}
loff_t pos = sd->pos;
int ret, more;
- ret = buf->ops->pin(pipe, buf);
+ ret = buf->ops->confirm(pipe, buf);
if (!ret) {
more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
struct address_space *mapping = file->f_mapping;
unsigned int offset, this_len;
struct page *page;
- pgoff_t index;
+ void *fsdata;
int ret;
/*
* make sure the data in this buffer is uptodate
*/
- ret = buf->ops->pin(pipe, buf);
+ ret = buf->ops->confirm(pipe, buf);
if (unlikely(ret))
return ret;
- index = sd->pos >> PAGE_CACHE_SHIFT;
offset = sd->pos & ~PAGE_CACHE_MASK;
this_len = sd->len;
if (this_len + offset > PAGE_CACHE_SIZE)
this_len = PAGE_CACHE_SIZE - offset;
-find_page:
- page = find_lock_page(mapping, index);
- if (!page) {
- ret = -ENOMEM;
- page = page_cache_alloc_cold(mapping);
- if (unlikely(!page))
- goto out_ret;
-
- /*
- * This will also lock the page
- */
- ret = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL);
- if (unlikely(ret))
- goto out;
- }
-
- ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
- if (unlikely(ret)) {
- loff_t isize = i_size_read(mapping->host);
-
- if (ret != AOP_TRUNCATED_PAGE)
- unlock_page(page);
- page_cache_release(page);
- if (ret == AOP_TRUNCATED_PAGE)
- goto find_page;
-
- /*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
- */
- if (sd->pos + this_len > isize)
- vmtruncate(mapping->host, isize);
-
- goto out_ret;
- }
+ ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
+ AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+ if (unlikely(ret))
+ goto out;
if (buf->page != page) {
/*
kunmap_atomic(dst, KM_USER1);
buf->ops->unmap(pipe, buf, src);
}
-
- ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
- if (ret) {
- if (ret == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto find_page;
- }
- if (ret < 0)
- goto out;
- /*
- * Partial write has happened, so 'ret' already initialized by
- * number of bytes written, Where is nothing we have to do here.
- */
- } else
- ret = this_len;
- /*
- * Return the number of bytes written and mark page as
- * accessed, we are now done!
- */
- mark_page_accessed(page);
+ ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
+ page, fsdata);
out:
- page_cache_release(page);
- unlock_page(page);
-out_ret:
return ret;
}
-/*
- * Pipe input worker. Most of this logic works like a regular pipe, the
- * key here is the 'actor' worker passed in that actually moves the data
- * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
+/**
+ * __splice_from_pipe - splice data from a pipe to given actor
+ * @pipe: pipe to splice from
+ * @sd: information to @actor
+ * @actor: handler that splices the data
+ *
+ * Description:
+ * This function does little more than loop over the pipe and call
+ * @actor to do the actual moving of a single struct pipe_buffer to
+ * the desired destination. See pipe_to_file, pipe_to_sendpage, or
+ * pipe_to_user.
+ *
*/
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_actor *actor)
}
EXPORT_SYMBOL(__splice_from_pipe);
+/**
+ * splice_from_pipe - splice data from a pipe to a file
+ * @pipe: pipe to splice from
+ * @out: file to splice to
+ * @ppos: position in @out
+ * @len: how many bytes to splice
+ * @flags: splice modifier flags
+ * @actor: handler that splices the data
+ *
+ * Description:
+ * See __splice_from_pipe. This function locks the input and output inodes,
+ * otherwise it's identical to __splice_from_pipe().
+ *
+ */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags,
splice_actor *actor)
* generic_file_splice_write_nolock - generic_file_splice_write without mutexes
* @pipe: pipe info
* @out: file to write to
+ * @ppos: position in @out
* @len: number of bytes to splice
* @flags: splice modifier flags
*
- * Will either move or copy pages (determined by @flags options) from
- * the given pipe inode to the given file. The caller is responsible
- * for acquiring i_mutex on both inodes.
+ * Description:
+ * Will either move or copy pages (determined by @flags options) from
+ * the given pipe inode to the given file. The caller is responsible
+ * for acquiring i_mutex on both inodes.
*
*/
ssize_t
* generic_file_splice_write - splice data from a pipe to a file
* @pipe: pipe info
* @out: file to write to
+ * @ppos: position in @out
* @len: number of bytes to splice
* @flags: splice modifier flags
*
- * Will either move or copy pages (determined by @flags options) from
- * the given pipe inode to the given file.
+ * Description:
+ * Will either move or copy pages (determined by @flags options) from
+ * the given pipe inode to the given file.
*
*/
ssize_t
{
struct address_space *mapping = out->f_mapping;
struct inode *inode = mapping->host;
+ struct splice_desc sd = {
+ .total_len = len,
+ .flags = flags,
+ .pos = *ppos,
+ .u.file = out,
+ };
ssize_t ret;
- int err;
-
- err = should_remove_suid(out->f_path.dentry);
- if (unlikely(err)) {
- mutex_lock(&inode->i_mutex);
- err = __remove_suid(out->f_path.dentry, err);
- mutex_unlock(&inode->i_mutex);
- if (err)
- return err;
- }
- ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+ inode_double_lock(inode, pipe->inode);
+ ret = remove_suid(out->f_path.dentry);
+ if (likely(!ret))
+ ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
+ inode_double_unlock(inode, pipe->inode);
if (ret > 0) {
unsigned long nr_pages;
* sync it.
*/
if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err;
+
mutex_lock(&inode->i_mutex);
err = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
/**
* generic_splice_sendpage - splice data from a pipe to a socket
- * @inode: pipe inode
+ * @pipe: pipe to splice from
* @out: socket to write to
+ * @ppos: position in @out
* @len: number of bytes to splice
* @flags: splice modifier flags
*
- * Will send @len bytes from the pipe to a network socket. No data copying
- * is involved.
+ * Description:
+ * Will send @len bytes from the pipe to a network socket. No data copying
+ * is involved.
*
*/
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
-/*
- * Splices from an input file to an actor, using a 'direct' pipe.
+/**
+ * splice_direct_to_actor - splices data directly between two non-pipes
+ * @in: file to splice from
+ * @sd: actor information on where to splice to
+ * @actor: handles the data splicing
+ *
+ * Description:
+ * This is a special case helper to splice directly between two
+ * points, without requiring an explicit pipe. Internally an allocated
+ * pipe is cached in the process, and reused during the lifetime of
+ * that process.
+ *
*/
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
splice_direct_actor *actor)
sd->flags &= ~SPLICE_F_NONBLOCK;
while (len) {
- size_t read_len, max_read_len;
-
- /*
- * Do at most PIPE_BUFFERS pages worth of transfer:
- */
- max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
+ size_t read_len;
+ loff_t pos = sd->pos;
- ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags);
- if (unlikely(ret < 0))
+ ret = do_splice_to(in, &pos, pipe, len, flags);
+ if (unlikely(ret <= 0))
goto out_release;
read_len = ret;
* could get stuck data in the internal pipe:
*/
ret = actor(pipe, sd);
- if (unlikely(ret < 0))
+ if (unlikely(ret <= 0))
goto out_release;
bytes += ret;
len -= ret;
+ sd->pos = pos;
- /*
- * In nonblocking mode, if we got back a short read then
- * that was due to either an IO error or due to the
- * pagecache entry not being there. In the IO error case
- * the _next_ splice attempt will produce a clean IO error
- * return value (not a short read), so in both cases it's
- * correct to break out of the loop here:
- */
- if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
- break;
+ if (ret < read_len)
+ goto out_release;
}
+done:
pipe->nrbufs = pipe->curbuf = 0;
-
+ file_accessed(in);
return bytes;
out_release:
buf->ops = NULL;
}
}
- pipe->nrbufs = pipe->curbuf = 0;
- /*
- * If we transferred some data, return the number of bytes:
- */
- if (bytes > 0)
- return bytes;
-
- return ret;
+ if (!bytes)
+ bytes = ret;
+ goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
}
+/**
+ * do_splice_direct - splices data directly between two files
+ * @in: file to splice from
+ * @ppos: input file offset
+ * @out: file to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * For use by do_sendfile(). splice can easily emulate sendfile, but
+ * doing it in the application would incur an extra system call
+ * (splice in + splice out, as compared to just sendfile()). So this helper
+ * can splice directly through a process-private pipe.
+ *
+ */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
size_t len, unsigned int flags)
{
.pos = *ppos,
.u.file = out,
};
- size_t ret;
+ long ret;
ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
- *ppos = sd.pos;
+ if (ret > 0)
+ *ppos += ret;
+
return ret;
}
}
/*
+ * Do a copy-from-user while holding the mmap_semaphore for reading, in a
+ * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
+ * for writing) and page faulting on the user memory pointed to by src.
+ * This assumes that we will very rarely hit the partial != 0 path, or this
+ * will not be a win.
+ */
+static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
+{
+ int partial;
+
+ if (!access_ok(VERIFY_READ, src, n))
+ return -EFAULT;
+
+ pagefault_disable();
+ partial = __copy_from_user_inatomic(dst, src, n);
+ pagefault_enable();
+
+ /*
+ * Didn't copy everything, drop the mmap_sem and do a faulting copy
+ */
+ if (unlikely(partial)) {
+ up_read(¤t->mm->mmap_sem);
+ partial = copy_from_user(dst, src, n);
+ down_read(¤t->mm->mmap_sem);
+ }
+
+ return partial;
+}
+
+/*
* Map an iov into an array of pages and offset/length tupples. With the
* partial_page structure, we can map several non-contiguous ranges into
* our ones pages[] map instead of splitting that operation into pieces.
{
int buffers = 0, error = 0;
- /*
- * It's ok to take the mmap_sem for reading, even
- * across a "get_user()".
- */
down_read(¤t->mm->mmap_sem);
while (nr_vecs) {
unsigned long off, npages;
+ struct iovec entry;
void __user *base;
size_t len;
int i;
- /*
- * Get user address base and length for this iovec.
- */
- error = get_user(base, &iov->iov_base);
- if (unlikely(error))
- break;
- error = get_user(len, &iov->iov_len);
- if (unlikely(error))
+ error = -EFAULT;
+ if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
break;
+ base = entry.iov_base;
+ len = entry.iov_len;
+
/*
* Sanity check this iovec. 0 read succeeds.
*/
+ error = 0;
if (unlikely(!len))
break;
error = -EFAULT;
- if (unlikely(!base))
+ if (!access_ok(VERIFY_READ, base, len))
break;
/*
char *src;
int ret;
- ret = buf->ops->pin(pipe, buf);
+ ret = buf->ops->confirm(pipe, buf);
if (unlikely(ret))
return ret;
if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
ret = -EFAULT;
+ buf->ops->unmap(pipe, buf, src);
out:
if (ret > 0)
sd->u.userptr += ret;
- buf->ops->unmap(pipe, buf, src);
return ret;
}
break;
}
+ if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
+ error = -EFAULT;
+ break;
+ }
+
sd.len = 0;
sd.total_len = len;
sd.flags = flags;
.partial = partial,
.flags = flags,
.ops = &user_page_pipe_buf_ops,
+ .spd_release = spd_release_page,
};
pipe = pipe_info(file->f_path.dentry->d_inode);
i++;
} while (len);
+ /*
+ * return EAGAIN if we have the potential of some data in the
+ * future, otherwise just return 0
+ */
+ if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
+ ret = -EAGAIN;
+
inode_double_unlock(ipipe->inode, opipe->inode);
/*
ret = link_ipipe_prep(ipipe, flags);
if (!ret) {
ret = link_opipe_prep(opipe, flags);
- if (!ret) {
+ if (!ret)
ret = link_pipe(ipipe, opipe, len, flags);
- if (!ret && (flags & SPLICE_F_NONBLOCK))
- ret = -EAGAIN;
- }
}
}