fs/splice.c
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files and fixing the initial implementation
 * bugs.
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
 *
 */
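/*
 * A minimal usage sketch from userspace (assuming a wrapper matching the
 * four-argument sys_splice() at the bottom of this file, and that the
 * destination's file_operations wire up ->splice_write; descriptor names
 * are illustrative and error handling is omitted): move up to 64k from a
 * file into a pipe, then push whatever arrived out to a socket, without
 * the data ever passing through userspace:
 *
 *      ret = splice(file_fd, pipe_fd[1], 65536, 0);
 *      ret = splice(pipe_fd[0], sock_fd, ret, 0);
 */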
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/module.h>

/*
 * Passed to the actors
 */
struct splice_desc {
        unsigned int len, total_len;    /* current and remaining length */
        unsigned int flags;             /* splice flags */
        struct file *file;              /* file to read/write */
        loff_t pos;                     /* file position */
};

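/*
 * Try to steal the page out of the page cache so the consumer of the pipe
 * buffer can take ownership of it without a copy. Returns 0 on success
 * (the page has been removed from its mapping and the LRU, buf->stolen is
 * set), or 1 if the page could not be removed and must be copied instead.
 */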
static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
                                     struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        WARN_ON(!PageLocked(page));
        WARN_ON(!PageUptodate(page));

        if (!remove_mapping(page_mapping(page), page))
                return 1;

        if (PageLRU(page)) {
                struct zone *zone = page_zone(page);

                spin_lock_irq(&zone->lru_lock);
                BUG_ON(!PageLRU(page));
                __ClearPageLRU(page);
                del_page_from_lru(zone, page);
                spin_unlock_irq(&zone->lru_lock);
        }

        buf->stolen = 1;
        return 0;
}

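/*
 * Drop the page cache reference this pipe buffer holds on its page.
 */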
static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
                                        struct pipe_buffer *buf)
{
        page_cache_release(buf->page);
        buf->page = NULL;
        buf->stolen = 0;
}

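/*
 * Lock the page and map it into kernel address space. Fails with -EIO if
 * the page is no longer uptodate, or -ENODATA if it has been truncated
 * away from its mapping in the meantime.
 */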
static void *page_cache_pipe_buf_map(struct file *file,
                                     struct pipe_inode_info *info,
                                     struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        lock_page(page);

        if (!PageUptodate(page)) {
                unlock_page(page);
                return ERR_PTR(-EIO);
        }

        if (!page->mapping) {
                unlock_page(page);
                return ERR_PTR(-ENODATA);
        }

        return kmap(buf->page);
}

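/*
 * Undo ->map above: unlock the page (unless it was stolen) and drop the
 * kmap.
 */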
static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
                                      struct pipe_buffer *buf)
{
        if (!buf->stolen)
                unlock_page(buf->page);
        kunmap(buf->page);
}

static struct pipe_buf_operations page_cache_pipe_buf_ops = {
        .can_merge = 0,
        .map = page_cache_pipe_buf_map,
        .unmap = page_cache_pipe_buf_unmap,
        .release = page_cache_pipe_buf_release,
        .steal = page_cache_pipe_buf_steal,
};

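/*
 * Fill the pipe with the given pages, starting at 'offset' into the first
 * page, for up to 'len' bytes in total. Waits if the pipe is full, bails
 * out with -EPIPE if there are no readers left or -ERESTARTSYS on a
 * pending signal (unless some data was already queued). Returns the
 * number of bytes spliced into the pipe; pages that did not make it in
 * are released before returning.
 */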
static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
                            int nr_pages, unsigned long offset,
                            unsigned long len)
{
        struct pipe_inode_info *info;
        int ret, do_wakeup, i;

        ret = 0;
        do_wakeup = 0;
        i = 0;

        mutex_lock(PIPE_MUTEX(*inode));

        info = inode->i_pipe;
        for (;;) {
                int bufs;

                if (!PIPE_READERS(*inode)) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                bufs = info->nrbufs;
                if (bufs < PIPE_BUFFERS) {
                        int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
                        struct pipe_buffer *buf = info->bufs + newbuf;
                        struct page *page = pages[i++];
                        unsigned long this_len;

                        this_len = PAGE_CACHE_SIZE - offset;
                        if (this_len > len)
                                this_len = len;

                        buf->page = page;
                        buf->offset = offset;
                        buf->len = this_len;
                        buf->ops = &page_cache_pipe_buf_ops;
                        info->nrbufs = ++bufs;
                        do_wakeup = 1;

                        ret += this_len;
                        len -= this_len;
                        offset = 0;
                        if (!--nr_pages)
                                break;
                        if (!len)
                                break;
                        if (bufs < PIPE_BUFFERS)
                                continue;

                        break;
                }

                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                if (do_wakeup) {
                        wake_up_interruptible_sync(PIPE_WAIT(*inode));
                        kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
                                    POLL_IN);
                        do_wakeup = 0;
                }

                PIPE_WAITING_WRITERS(*inode)++;
                pipe_wait(inode);
                PIPE_WAITING_WRITERS(*inode)--;
        }

        mutex_unlock(PIPE_MUTEX(*inode));

        if (do_wakeup) {
                wake_up_interruptible(PIPE_WAIT(*inode));
                kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
        }

        while (i < nr_pages)
                page_cache_release(pages[i++]);

        return ret;
}

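/*
 * Kick off read-ahead over the range at the current file position, look
 * up the page cache pages covering it (creating and reading any that are
 * missing), then hand as many of them as possible to move_to_pipe().
 */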
static int __generic_file_splice_read(struct file *in, struct inode *pipe,
                                      size_t len)
{
        struct address_space *mapping = in->f_mapping;
        unsigned int offset, nr_pages;
        struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
        struct page *page;
        pgoff_t index, pidx;
        int i, j;

        index = in->f_pos >> PAGE_CACHE_SHIFT;
        offset = in->f_pos & ~PAGE_CACHE_MASK;
        nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

        if (nr_pages > PIPE_BUFFERS)
                nr_pages = PIPE_BUFFERS;

        /*
         * initiate read-ahead on this page range
         */
        do_page_cache_readahead(mapping, in, index, nr_pages);

        /*
         * Get as many pages from the page cache as possible..
         * Start IO on the page cache entries we create (we
         * can assume that any pre-existing ones we find have
         * already had IO started on them).
         */
        i = find_get_pages(mapping, index, nr_pages, pages);

        /*
         * common case - we found all pages and they are contiguous,
         * kick them off
         */
        if (i && (pages[i - 1]->index == index + i - 1))
                goto splice_them;

        /*
         * fill shadow[] with pages at the right locations, so we only
         * have to fill holes
         */
        memset(shadow, 0, i * sizeof(struct page *));
        for (j = 0, pidx = index; j < i; pidx++, j++)
                shadow[pages[j]->index - pidx] = pages[j];

        /*
         * now fill in the holes
         */
        for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
                int error;

                if (shadow[i])
                        continue;

                /*
                 * no page there, look one up / create it
                 */
                page = find_or_create_page(mapping, pidx,
                                           mapping_gfp_mask(mapping));
                if (!page)
                        break;

                if (PageUptodate(page))
                        unlock_page(page);
                else {
                        error = mapping->a_ops->readpage(in, page);

                        if (unlikely(error)) {
                                page_cache_release(page);
                                break;
                        }
                }
                shadow[i] = page;
        }

        if (!i) {
                for (i = 0; i < nr_pages; i++) {
                        if (shadow[i])
                                page_cache_release(shadow[i]);
                }
                return 0;
        }

        memcpy(pages, shadow, i * sizeof(struct page *));

        /*
         * Now we splice them into the pipe..
         */
splice_them:
        return move_to_pipe(pipe, pages, i, offset, len);
}

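/*
 * Splice data from 'in' into the pipe, advancing the file position as we
 * go. Loops until 'len' bytes have been spliced or the underlying read
 * makes no progress. Returns the number of bytes spliced, or the last
 * error if nothing was spliced at all.
 */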
ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
                                 size_t len, unsigned int flags)
{
        ssize_t spliced;
        int ret;

        ret = 0;
        spliced = 0;
        while (len) {
                ret = __generic_file_splice_read(in, pipe, len);

                if (ret <= 0)
                        break;

                in->f_pos += ret;
                len -= ret;
                spliced += ret;
        }

        if (spliced)
                return spliced;

        return ret;
}

/*
 * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
 */
static int pipe_to_sendpage(struct pipe_inode_info *info,
                            struct pipe_buffer *buf, struct splice_desc *sd)
{
        struct file *file = sd->file;
        loff_t pos = sd->pos;
        unsigned int offset;
        ssize_t ret;
        void *ptr;

        /*
         * sub-optimal, but we are limited by the pipe ->map. we don't
         * need a kmap'ed buffer here, we just want to make sure we
         * have the page pinned if the pipe page originates from the
         * page cache
         */
        ptr = buf->ops->map(file, info, buf);
        if (IS_ERR(ptr))
                return PTR_ERR(ptr);

        offset = pos & ~PAGE_CACHE_MASK;

        ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
                                   sd->len < sd->total_len);

        buf->ops->unmap(info, buf);
        if (ret == sd->len)
                return 0;

        return -EIO;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *      - Destination page already exists in the address space and there
 *        are users of it. For that case we have no other option than
 *        copying the data. Tough luck.
 *      - Destination page already exists in the address space, but there
 *        are no users of it. Make sure it's uptodate, then drop it. Fall
 *        through to last case.
 *      - Destination page does not exist, we can add the pipe page to
 *        the page cache and avoid the copy.
 *
 * For now we just do the slower thing and always copy pages over, it's
 * easier than migrating pages from the pipe to the target file. For the
 * case of doing file | file splicing, the migrate approach had some LRU
 * nastiness...
 */
static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
                        struct splice_desc *sd)
{
        struct file *file = sd->file;
        struct address_space *mapping = file->f_mapping;
        unsigned int offset;
        struct page *page;
        pgoff_t index;
        char *src;
        int ret;

        /*
         * after this, page will be locked and unmapped
         */
        src = buf->ops->map(file, info, buf);
        if (IS_ERR(src))
                return PTR_ERR(src);

        index = sd->pos >> PAGE_CACHE_SHIFT;
        offset = sd->pos & ~PAGE_CACHE_MASK;

        /*
         * reuse buf page, if SPLICE_F_MOVE is set
         */
        if (sd->flags & SPLICE_F_MOVE) {
                if (buf->ops->steal(info, buf))
                        goto find_page;

                page = buf->page;
                if (add_to_page_cache_lru(page, mapping, index,
                                          mapping_gfp_mask(mapping)))
                        goto find_page;
        } else {
find_page:
                ret = -ENOMEM;
                page = find_or_create_page(mapping, index,
                                           mapping_gfp_mask(mapping));
                if (!page)
                        goto out;

                /*
                 * If the page is uptodate, it is also locked. If it isn't
                 * uptodate, we can mark it uptodate if we are filling the
                 * full page. Otherwise we need to read it in first...
                 */
                if (!PageUptodate(page)) {
                        if (sd->len < PAGE_CACHE_SIZE) {
                                ret = mapping->a_ops->readpage(file, page);
                                if (unlikely(ret))
                                        goto out;

                                lock_page(page);

                                if (!PageUptodate(page)) {
                                        /*
                                         * page got invalidated, repeat
                                         */
                                        if (!page->mapping) {
                                                unlock_page(page);
                                                page_cache_release(page);
                                                goto find_page;
                                        }
                                        ret = -EIO;
                                        goto out;
                                }
                        } else {
                                WARN_ON(!PageLocked(page));
                                SetPageUptodate(page);
                        }
                }
        }

        ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
        if (ret)
                goto out;

        if (!buf->stolen) {
                char *dst = kmap_atomic(page, KM_USER0);

                memcpy(dst + offset, src + buf->offset, sd->len);
                flush_dcache_page(page);
                kunmap_atomic(dst, KM_USER0);
        }

        ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
        if (ret < 0)
                goto out;

        set_page_dirty(page);
        ret = write_one_page(page, 0);
out:
        if (ret < 0)
                unlock_page(page);
        if (!buf->stolen)
                page_cache_release(page);
        buf->ops->unmap(info, buf);
        return ret;
}

typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
                           struct splice_desc *);

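/*
 * Drain the pipe by handing each buffer to 'actor' (pipe_to_file or
 * pipe_to_sendpage). Stops once 'len' bytes have been consumed, the last
 * writer goes away, or an error/signal interrupts the transfer. Returns
 * the number of bytes moved out of the pipe (or an error if nothing was
 * moved) and updates out->f_pos under the destination inode's i_mutex.
 */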
static ssize_t move_from_pipe(struct inode *inode, struct file *out,
                              size_t len, unsigned int flags,
                              splice_actor *actor)
{
        struct pipe_inode_info *info;
        int ret, do_wakeup, err;
        struct splice_desc sd;

        ret = 0;
        do_wakeup = 0;

        sd.total_len = len;
        sd.flags = flags;
        sd.file = out;
        sd.pos = out->f_pos;

        mutex_lock(PIPE_MUTEX(*inode));

        info = inode->i_pipe;
        for (;;) {
                int bufs = info->nrbufs;

                if (bufs) {
                        int curbuf = info->curbuf;
                        struct pipe_buffer *buf = info->bufs + curbuf;
                        struct pipe_buf_operations *ops = buf->ops;

                        sd.len = buf->len;
                        if (sd.len > sd.total_len)
                                sd.len = sd.total_len;

                        err = actor(info, buf, &sd);
                        if (err) {
                                if (!ret && err != -ENODATA)
                                        ret = err;

                                break;
                        }

                        ret += sd.len;
                        buf->offset += sd.len;
                        buf->len -= sd.len;
                        if (!buf->len) {
                                buf->ops = NULL;
                                ops->release(info, buf);
                                curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
                                info->curbuf = curbuf;
                                info->nrbufs = --bufs;
                                do_wakeup = 1;
                        }

                        sd.pos += sd.len;
                        sd.total_len -= sd.len;
                        if (!sd.total_len)
                                break;
                }

                if (bufs)
                        continue;
                if (!PIPE_WRITERS(*inode))
                        break;
                if (!PIPE_WAITING_WRITERS(*inode)) {
                        if (ret)
                                break;
                }

                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                if (do_wakeup) {
                        wake_up_interruptible_sync(PIPE_WAIT(*inode));
                        kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
                        do_wakeup = 0;
                }

                pipe_wait(inode);
        }

        mutex_unlock(PIPE_MUTEX(*inode));

        if (do_wakeup) {
                wake_up_interruptible(PIPE_WAIT(*inode));
                kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
        }

        mutex_lock(&out->f_mapping->host->i_mutex);
        out->f_pos = sd.pos;
        mutex_unlock(&out->f_mapping->host->i_mutex);
        return ret;
}

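/*
 * Splice from the pipe to a regular file, copying (or, with SPLICE_F_MOVE,
 * stealing) the pipe pages into the file's page cache.
 */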
ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
                                  size_t len, unsigned int flags)
{
        return move_from_pipe(inode, out, len, flags, pipe_to_file);
}

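/*
 * Splice from the pipe to a socket via the target's ->sendpage() hook.
 */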
ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
                                size_t len, unsigned int flags)
{
        return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_file_splice_write);
EXPORT_SYMBOL(generic_file_splice_read);

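/*
 * Splice from the pipe inode to 'out': check that 'out' is open for
 * writing and provides ->splice_write(), verify the write area, then
 * hand off to the file's ->splice_write().
 */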
static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
                           unsigned int flags)
{
        loff_t pos;
        int ret;

        if (!out->f_op || !out->f_op->splice_write)
                return -EINVAL;

        if (!(out->f_mode & FMODE_WRITE))
                return -EBADF;

        pos = out->f_pos;
        ret = rw_verify_area(WRITE, out, &pos, len);
        if (unlikely(ret < 0))
                return ret;

        return out->f_op->splice_write(pipe, out, len, flags);
}

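/*
 * Splice from 'in' into the pipe inode: check that 'in' is open for
 * reading and provides ->splice_read(), verify the read area, clamp 'len'
 * to the file size, then hand off to the file's ->splice_read().
 */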
static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
                         unsigned int flags)
{
        loff_t pos, isize, left;
        int ret;

        if (!in->f_op || !in->f_op->splice_read)
                return -EINVAL;

        if (!(in->f_mode & FMODE_READ))
                return -EBADF;

        pos = in->f_pos;
        ret = rw_verify_area(READ, in, &pos, len);
        if (unlikely(ret < 0))
                return ret;

        isize = i_size_read(in->f_mapping->host);
        if (unlikely(in->f_pos >= isize))
                return 0;

        left = isize - in->f_pos;
        if (left < len)
                len = left;

        return in->f_op->splice_read(in, pipe, len, flags);
}

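/*
 * Figure out which side of the transfer is the pipe and dispatch: if the
 * input is a pipe, splice out of it; if the output is a pipe, splice into
 * it; otherwise the request is invalid.
 */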
static long do_splice(struct file *in, struct file *out, size_t len,
                      unsigned int flags)
{
        struct inode *pipe;

        pipe = in->f_dentry->d_inode;
        if (pipe->i_pipe)
                return do_splice_from(pipe, out, len, flags);

        pipe = out->f_dentry->d_inode;
        if (pipe->i_pipe)
                return do_splice_to(in, pipe, len, flags);

        return -EINVAL;
}

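/*
 * The splice() system call: look up both file descriptors, check their
 * access modes, and hand the transfer off to do_splice().
 */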
asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
{
        long error;
        struct file *in, *out;
        int fput_in, fput_out;

        if (unlikely(!len))
                return 0;

        error = -EBADF;
        in = fget_light(fdin, &fput_in);
        if (in) {
                if (in->f_mode & FMODE_READ) {
                        out = fget_light(fdout, &fput_out);
                        if (out) {
                                if (out->f_mode & FMODE_WRITE)
                                        error = do_splice(in, out, len, flags);
                                fput_light(out, fput_out);
                        }
                }

                fput_light(in, fput_in);
        }

        return error;
}