[PATCH] splice: add support for SPLICE_F_MOVE flag
[safe/jmp/linux-2.6] / fs / splice.c
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files and fixing the initial implementation
13  * bugs.
14  *
15  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
16  * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
17  *
18  */
19 #include <linux/fs.h>
20 #include <linux/file.h>
21 #include <linux/pagemap.h>
22 #include <linux/pipe_fs_i.h>
23 #include <linux/mm_inline.h>
24 #include <linux/swap.h>
25
26 /*
27  * Passed to the actors
28  */
29 struct splice_desc {
30         unsigned int len, total_len;    /* current and remaining length */
31         unsigned int flags;             /* splice flags */
32         struct file *file;              /* file to read/write */
33         loff_t pos;                     /* file position */
34 };
35
36 static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
37                                      struct pipe_buffer *buf)
38 {
39         struct page *page = buf->page;
40
41         WARN_ON(!PageLocked(page));
42         WARN_ON(!PageUptodate(page));
43
44         if (!remove_mapping(page_mapping(page), page))
45                 return 1;
46
47         if (PageLRU(page)) {
48                 struct zone *zone = page_zone(page);
49
50                 spin_lock_irq(&zone->lru_lock);
51                 BUG_ON(!PageLRU(page));
52                 __ClearPageLRU(page);
53                 del_page_from_lru(zone, page);
54                 spin_unlock_irq(&zone->lru_lock);
55         }
56
57         buf->stolen = 1;
58         return 0;
59 }
60
61 static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
62                                         struct pipe_buffer *buf)
63 {
64         page_cache_release(buf->page);
65         buf->page = NULL;
66         buf->stolen = 0;
67 }
68
69 static void *page_cache_pipe_buf_map(struct file *file,
70                                      struct pipe_inode_info *info,
71                                      struct pipe_buffer *buf)
72 {
73         struct page *page = buf->page;
74
75         lock_page(page);
76
77         if (!PageUptodate(page)) {
78                 unlock_page(page);
79                 return ERR_PTR(-EIO);
80         }
81
82         if (!page->mapping) {
83                 unlock_page(page);
84                 return ERR_PTR(-ENODATA);
85         }
86
87         return kmap(buf->page);
88 }
89
90 static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
91                                       struct pipe_buffer *buf)
92 {
93         if (!buf->stolen)
94                 unlock_page(buf->page);
95         kunmap(buf->page);
96 }
97
98 static struct pipe_buf_operations page_cache_pipe_buf_ops = {
99         .can_merge = 0,
100         .map = page_cache_pipe_buf_map,
101         .unmap = page_cache_pipe_buf_unmap,
102         .release = page_cache_pipe_buf_release,
103         .steal = page_cache_pipe_buf_steal,
104 };
105
106 static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
107                             int nr_pages, unsigned long offset,
108                             unsigned long len)
109 {
110         struct pipe_inode_info *info;
111         int ret, do_wakeup, i;
112
113         ret = 0;
114         do_wakeup = 0;
115         i = 0;
116
117         mutex_lock(PIPE_MUTEX(*inode));
118
119         info = inode->i_pipe;
120         for (;;) {
121                 int bufs;
122
123                 if (!PIPE_READERS(*inode)) {
124                         send_sig(SIGPIPE, current, 0);
125                         if (!ret)
126                                 ret = -EPIPE;
127                         break;
128                 }
129
130                 bufs = info->nrbufs;
131                 if (bufs < PIPE_BUFFERS) {
132                         int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
133                         struct pipe_buffer *buf = info->bufs + newbuf;
134                         struct page *page = pages[i++];
135                         unsigned long this_len;
136
137                         this_len = PAGE_CACHE_SIZE - offset;
138                         if (this_len > len)
139                                 this_len = len;
140
141                         buf->page = page;
142                         buf->offset = offset;
143                         buf->len = this_len;
144                         buf->ops = &page_cache_pipe_buf_ops;
145                         info->nrbufs = ++bufs;
146                         do_wakeup = 1;
147
148                         ret += this_len;
149                         len -= this_len;
150                         offset = 0;
151                         if (!--nr_pages)
152                                 break;
153                         if (!len)
154                                 break;
155                         if (bufs < PIPE_BUFFERS)
156                                 continue;
157
158                         break;
159                 }
160
161                 if (signal_pending(current)) {
162                         if (!ret)
163                                 ret = -ERESTARTSYS;
164                         break;
165                 }
166
167                 if (do_wakeup) {
168                         wake_up_interruptible_sync(PIPE_WAIT(*inode));
169                         kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
170                                     POLL_IN);
171                         do_wakeup = 0;
172                 }
173
174                 PIPE_WAITING_WRITERS(*inode)++;
175                 pipe_wait(inode);
176                 PIPE_WAITING_WRITERS(*inode)--;
177         }
178
179         mutex_unlock(PIPE_MUTEX(*inode));
180
181         if (do_wakeup) {
182                 wake_up_interruptible(PIPE_WAIT(*inode));
183                 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
184         }
185
186         while (i < nr_pages)
187                 page_cache_release(pages[i++]);
188
189         return ret;
190 }
191
192 static int __generic_file_splice_read(struct file *in, struct inode *pipe,
193                                       size_t len)
194 {
195         struct address_space *mapping = in->f_mapping;
196         unsigned int offset, nr_pages;
197         struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
198         struct page *page;
199         pgoff_t index, pidx;
200         int i, j;
201
202         index = in->f_pos >> PAGE_CACHE_SHIFT;
203         offset = in->f_pos & ~PAGE_CACHE_MASK;
204         nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
205
206         if (nr_pages > PIPE_BUFFERS)
207                 nr_pages = PIPE_BUFFERS;
208
209         /*
210          * initiate read-ahead on this page range
211          */
212         do_page_cache_readahead(mapping, in, index, nr_pages);
213
214         /*
215          * Get as many pages from the page cache as possible..
216          * Start IO on the page cache entries we create (we
217          * can assume that any pre-existing ones we find have
218          * already had IO started on them).
219          */
220         i = find_get_pages(mapping, index, nr_pages, pages);
221
222         /*
223          * common case - we found all pages and they are contiguous,
224          * kick them off
225          */
226         if (i && (pages[i - 1]->index == index + i - 1))
227                 goto splice_them;
228
229         /*
230          * fill shadow[] with pages at the right locations, so we only
231          * have to fill holes
232          */
233         memset(shadow, 0, i * sizeof(struct page *));
234         for (j = 0, pidx = index; j < i; pidx++, j++)
235                 shadow[pages[j]->index - pidx] = pages[j];
236
237         /*
238          * now fill in the holes
239          */
240         for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
241                 int error;
242
243                 if (shadow[i])
244                         continue;
245
246                 /*
247                  * no page there, look one up / create it
248                  */
249                 page = find_or_create_page(mapping, pidx,
250                                                    mapping_gfp_mask(mapping));
251                 if (!page)
252                         break;
253
254                 if (PageUptodate(page))
255                         unlock_page(page);
256                 else {
257                         error = mapping->a_ops->readpage(in, page);
258
259                         if (unlikely(error)) {
260                                 page_cache_release(page);
261                                 break;
262                         }
263                 }
264                 shadow[i] = page;
265         }
266
267         if (!i) {
268                 for (i = 0; i < nr_pages; i++) {
269                          if (shadow[i])
270                                 page_cache_release(shadow[i]);
271                 }
272                 return 0;
273         }
274
275         memcpy(pages, shadow, i * sizeof(struct page *));
276
277         /*
278          * Now we splice them into the pipe..
279          */
280 splice_them:
281         return move_to_pipe(pipe, pages, i, offset, len);
282 }
283
284 ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
285                                  size_t len, unsigned int flags)
286 {
287         ssize_t spliced;
288         int ret;
289
290         ret = 0;
291         spliced = 0;
292         while (len) {
293                 ret = __generic_file_splice_read(in, pipe, len);
294
295                 if (ret <= 0)
296                         break;
297
298                 in->f_pos += ret;
299                 len -= ret;
300                 spliced += ret;
301         }
302
303         if (spliced)
304                 return spliced;
305
306         return ret;
307 }
308
309 /*
310  * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
311  */
312 static int pipe_to_sendpage(struct pipe_inode_info *info,
313                             struct pipe_buffer *buf, struct splice_desc *sd)
314 {
315         struct file *file = sd->file;
316         loff_t pos = sd->pos;
317         unsigned int offset;
318         ssize_t ret;
319         void *ptr;
320
321         /*
322          * sub-optimal, but we are limited by the pipe ->map. we don't
323          * need a kmap'ed buffer here, we just want to make sure we
324          * have the page pinned if the pipe page originates from the
325          * page cache
326          */
327         ptr = buf->ops->map(file, info, buf);
328         if (IS_ERR(ptr))
329                 return PTR_ERR(ptr);
330
331         offset = pos & ~PAGE_CACHE_MASK;
332
333         ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
334                                         sd->len < sd->total_len);
335
336         buf->ops->unmap(info, buf);
337         if (ret == sd->len)
338                 return 0;
339
340         return -EIO;
341 }
342
343 /*
344  * This is a little more tricky than the file -> pipe splicing. There are
345  * basically three cases:
346  *
347  *      - Destination page already exists in the address space and there
348  *        are users of it. For that case we have no other option that
349  *        copying the data. Tough luck.
350  *      - Destination page already exists in the address space, but there
351  *        are no users of it. Make sure it's uptodate, then drop it. Fall
352  *        through to last case.
353  *      - Destination page does not exist, we can add the pipe page to
354  *        the page cache and avoid the copy.
355  *
356  * For now we just do the slower thing and always copy pages over, it's
357  * easier than migrating pages from the pipe to the target file. For the
358  * case of doing file | file splicing, the migrate approach had some LRU
359  * nastiness...
360  */
361 static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
362                         struct splice_desc *sd)
363 {
364         struct file *file = sd->file;
365         struct address_space *mapping = file->f_mapping;
366         unsigned int offset;
367         struct page *page;
368         pgoff_t index;
369         char *src;
370         int ret;
371
372         /*
373          * after this, page will be locked and unmapped
374          */
375         src = buf->ops->map(file, info, buf);
376         if (IS_ERR(src))
377                 return PTR_ERR(src);
378
379         index = sd->pos >> PAGE_CACHE_SHIFT;
380         offset = sd->pos & ~PAGE_CACHE_MASK;
381
382         /*
383          * reuse buf page, if SPLICE_F_MOVE is set
384          */
385         if (sd->flags & SPLICE_F_MOVE) {
386                 if (buf->ops->steal(info, buf))
387                         goto find_page;
388
389                 page = buf->page;
390                 if (add_to_page_cache_lru(page, mapping, index,
391                                                 mapping_gfp_mask(mapping)))
392                         goto find_page;
393         } else {
394 find_page:
395                 ret = -ENOMEM;
396                 page = find_or_create_page(mapping, index,
397                                                 mapping_gfp_mask(mapping));
398                 if (!page)
399                         goto out;
400
401                 /*
402                  * If the page is uptodate, it is also locked. If it isn't
403                  * uptodate, we can mark it uptodate if we are filling the
404                  * full page. Otherwise we need to read it in first...
405                  */
406                 if (!PageUptodate(page)) {
407                         if (sd->len < PAGE_CACHE_SIZE) {
408                                 ret = mapping->a_ops->readpage(file, page);
409                                 if (unlikely(ret))
410                                         goto out;
411
412                                 lock_page(page);
413
414                                 if (!PageUptodate(page)) {
415                                         /*
416                                          * page got invalidated, repeat
417                                          */
418                                         if (!page->mapping) {
419                                                 unlock_page(page);
420                                                 page_cache_release(page);
421                                                 goto find_page;
422                                         }
423                                         ret = -EIO;
424                                         goto out;
425                                 }
426                         } else {
427                                 WARN_ON(!PageLocked(page));
428                                 SetPageUptodate(page);
429                         }
430                 }
431         }
432
433         ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
434         if (ret)
435                 goto out;
436
437         if (!buf->stolen) {
438                 char *dst = kmap_atomic(page, KM_USER0);
439
440                 memcpy(dst + offset, src + buf->offset, sd->len);
441                 flush_dcache_page(page);
442                 kunmap_atomic(dst, KM_USER0);
443         }
444
445         ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
446         if (ret < 0)
447                 goto out;
448
449         set_page_dirty(page);
450         ret = write_one_page(page, 0);
451 out:
452         if (ret < 0)
453                 unlock_page(page);
454         if (!buf->stolen)
455                 page_cache_release(page);
456         buf->ops->unmap(info, buf);
457         return ret;
458 }
459
/*
 * Callback invoked by move_from_pipe() for each pipe buffer it consumes.
 * A non-zero return value is treated as an error and stops the transfer.
 */
typedef int splice_actor(struct pipe_inode_info *, struct pipe_buffer *,
			 struct splice_desc *);
462
463 static ssize_t move_from_pipe(struct inode *inode, struct file *out,
464                               size_t len, unsigned int flags,
465                               splice_actor *actor)
466 {
467         struct pipe_inode_info *info;
468         int ret, do_wakeup, err;
469         struct splice_desc sd;
470
471         ret = 0;
472         do_wakeup = 0;
473
474         sd.total_len = len;
475         sd.flags = flags;
476         sd.file = out;
477         sd.pos = out->f_pos;
478
479         mutex_lock(PIPE_MUTEX(*inode));
480
481         info = inode->i_pipe;
482         for (;;) {
483                 int bufs = info->nrbufs;
484
485                 if (bufs) {
486                         int curbuf = info->curbuf;
487                         struct pipe_buffer *buf = info->bufs + curbuf;
488                         struct pipe_buf_operations *ops = buf->ops;
489
490                         sd.len = buf->len;
491                         if (sd.len > sd.total_len)
492                                 sd.len = sd.total_len;
493
494                         err = actor(info, buf, &sd);
495                         if (err) {
496                                 if (!ret && err != -ENODATA)
497                                         ret = err;
498
499                                 break;
500                         }
501
502                         ret += sd.len;
503                         buf->offset += sd.len;
504                         buf->len -= sd.len;
505                         if (!buf->len) {
506                                 buf->ops = NULL;
507                                 ops->release(info, buf);
508                                 curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
509                                 info->curbuf = curbuf;
510                                 info->nrbufs = --bufs;
511                                 do_wakeup = 1;
512                         }
513
514                         sd.pos += sd.len;
515                         sd.total_len -= sd.len;
516                         if (!sd.total_len)
517                                 break;
518                 }
519
520                 if (bufs)
521                         continue;
522                 if (!PIPE_WRITERS(*inode))
523                         break;
524                 if (!PIPE_WAITING_WRITERS(*inode)) {
525                         if (ret)
526                                 break;
527                 }
528
529                 if (signal_pending(current)) {
530                         if (!ret)
531                                 ret = -ERESTARTSYS;
532                         break;
533                 }
534
535                 if (do_wakeup) {
536                         wake_up_interruptible_sync(PIPE_WAIT(*inode));
537                         kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
538                         do_wakeup = 0;
539                 }
540
541                 pipe_wait(inode);
542         }
543
544         mutex_unlock(PIPE_MUTEX(*inode));
545
546         if (do_wakeup) {
547                 wake_up_interruptible(PIPE_WAIT(*inode));
548                 kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
549         }
550
551         mutex_lock(&out->f_mapping->host->i_mutex);
552         out->f_pos = sd.pos;
553         mutex_unlock(&out->f_mapping->host->i_mutex);
554         return ret;
555
556 }
557
558 ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
559                                   size_t len, unsigned int flags)
560 {
561         return move_from_pipe(inode, out, len, flags, pipe_to_file);
562 }
563
564 ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
565                                 size_t len, unsigned int flags)
566 {
567         return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
568 }
569
570 static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
571                            unsigned int flags)
572 {
573         loff_t pos;
574         int ret;
575
576         if (!out->f_op || !out->f_op->splice_write)
577                 return -EINVAL;
578
579         if (!(out->f_mode & FMODE_WRITE))
580                 return -EBADF;
581
582         pos = out->f_pos;
583         ret = rw_verify_area(WRITE, out, &pos, len);
584         if (unlikely(ret < 0))
585                 return ret;
586
587         return out->f_op->splice_write(pipe, out, len, flags);
588 }
589
590 static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
591                          unsigned int flags)
592 {
593         loff_t pos, isize, left;
594         int ret;
595
596         if (!in->f_op || !in->f_op->splice_read)
597                 return -EINVAL;
598
599         if (!(in->f_mode & FMODE_READ))
600                 return -EBADF;
601
602         pos = in->f_pos;
603         ret = rw_verify_area(READ, in, &pos, len);
604         if (unlikely(ret < 0))
605                 return ret;
606
607         isize = i_size_read(in->f_mapping->host);
608         if (unlikely(in->f_pos >= isize))
609                 return 0;
610         
611         left = isize - in->f_pos;
612         if (left < len)
613                 len = left;
614
615         return in->f_op->splice_read(in, pipe, len, flags);
616 }
617
618 static long do_splice(struct file *in, struct file *out, size_t len,
619                       unsigned int flags)
620 {
621         struct inode *pipe;
622
623         pipe = in->f_dentry->d_inode;
624         if (pipe->i_pipe)
625                 return do_splice_from(pipe, out, len, flags);
626
627         pipe = out->f_dentry->d_inode;
628         if (pipe->i_pipe)
629                 return do_splice_to(in, pipe, len, flags);
630
631         return -EINVAL;
632 }
633
634 asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
635 {
636         long error;
637         struct file *in, *out;
638         int fput_in, fput_out;
639
640         if (unlikely(!len))
641                 return 0;
642
643         error = -EBADF;
644         in = fget_light(fdin, &fput_in);
645         if (in) {
646                 if (in->f_mode & FMODE_READ) {
647                         out = fget_light(fdout, &fput_out);
648                         if (out) {
649                                 if (out->f_mode & FMODE_WRITE)
650                                         error = do_splice(in, out, len, flags);
651                                 fput_light(out, fput_out);
652                         }
653                 }
654
655                 fput_light(in, fput_in);
656         }
657
658         return error;
659 }