pipe: change the ->pin() operation to ->confirm()
[safe/jmp/linux-2.6] / fs / pipe.c
1 /*
2  *  linux/fs/pipe.c
3  *
4  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
5  */
6
7 #include <linux/mm.h>
8 #include <linux/file.h>
9 #include <linux/poll.h>
10 #include <linux/slab.h>
11 #include <linux/module.h>
12 #include <linux/init.h>
13 #include <linux/fs.h>
14 #include <linux/mount.h>
15 #include <linux/pipe_fs_i.h>
16 #include <linux/uio.h>
17 #include <linux/highmem.h>
18 #include <linux/pagemap.h>
19 #include <linux/audit.h>
20
21 #include <asm/uaccess.h>
22 #include <asm/ioctls.h>
23
24 /*
25  * We use a start+len construction, which provides full use of the 
26  * allocated memory.
27  * -- Florian Coosmann (FGC)
28  * 
29  * Reads with count = 0 should always return 0.
30  * -- Julian Bradfield 1999-06-07.
31  *
32  * FIFOs and Pipes now generate SIGIO for both readers and writers.
33  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
34  *
35  * pipe_read & write cleanup
36  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
37  */
38
39 /* Drop the inode semaphore and wait for a pipe event, atomically */
40 void pipe_wait(struct pipe_inode_info *pipe)
41 {
42         DEFINE_WAIT(wait);
43
44         /*
45          * Pipes are system-local resources, so sleeping on them
46          * is considered a noninteractive wait:
47          */
48         prepare_to_wait(&pipe->wait, &wait,
49                         TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
50         if (pipe->inode)
51                 mutex_unlock(&pipe->inode->i_mutex);
52         schedule();
53         finish_wait(&pipe->wait, &wait);
54         if (pipe->inode)
55                 mutex_lock(&pipe->inode->i_mutex);
56 }
57
58 static int
59 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
60                         int atomic)
61 {
62         unsigned long copy;
63
64         while (len > 0) {
65                 while (!iov->iov_len)
66                         iov++;
67                 copy = min_t(unsigned long, len, iov->iov_len);
68
69                 if (atomic) {
70                         if (__copy_from_user_inatomic(to, iov->iov_base, copy))
71                                 return -EFAULT;
72                 } else {
73                         if (copy_from_user(to, iov->iov_base, copy))
74                                 return -EFAULT;
75                 }
76                 to += copy;
77                 len -= copy;
78                 iov->iov_base += copy;
79                 iov->iov_len -= copy;
80         }
81         return 0;
82 }
83
84 static int
85 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
86                       int atomic)
87 {
88         unsigned long copy;
89
90         while (len > 0) {
91                 while (!iov->iov_len)
92                         iov++;
93                 copy = min_t(unsigned long, len, iov->iov_len);
94
95                 if (atomic) {
96                         if (__copy_to_user_inatomic(iov->iov_base, from, copy))
97                                 return -EFAULT;
98                 } else {
99                         if (copy_to_user(iov->iov_base, from, copy))
100                                 return -EFAULT;
101                 }
102                 from += copy;
103                 len -= copy;
104                 iov->iov_base += copy;
105                 iov->iov_len -= copy;
106         }
107         return 0;
108 }
109
110 /*
111  * Attempt to pre-fault in the user memory, so we can use atomic copies.
112  * Returns the number of bytes not faulted in.
113  */
114 static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
115 {
116         while (!iov->iov_len)
117                 iov++;
118
119         while (len > 0) {
120                 unsigned long this_len;
121
122                 this_len = min_t(unsigned long, len, iov->iov_len);
123                 if (fault_in_pages_writeable(iov->iov_base, this_len))
124                         break;
125
126                 len -= this_len;
127                 iov++;
128         }
129
130         return len;
131 }
132
133 /*
134  * Pre-fault in the user memory, so we can use atomic copies.
135  */
136 static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
137 {
138         while (!iov->iov_len)
139                 iov++;
140
141         while (len > 0) {
142                 unsigned long this_len;
143
144                 this_len = min_t(unsigned long, len, iov->iov_len);
145                 fault_in_pages_readable(iov->iov_base, this_len);
146                 len -= this_len;
147                 iov++;
148         }
149 }
150
151 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
152                                   struct pipe_buffer *buf)
153 {
154         struct page *page = buf->page;
155
156         /*
157          * If nobody else uses this page, and we don't already have a
158          * temporary page, let's keep track of it as a one-deep
159          * allocation cache. (Otherwise just release our reference to it)
160          */
161         if (page_count(page) == 1 && !pipe->tmp_page)
162                 pipe->tmp_page = page;
163         else
164                 page_cache_release(page);
165 }
166
167 void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
168                            struct pipe_buffer *buf, int atomic)
169 {
170         if (atomic) {
171                 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
172                 return kmap_atomic(buf->page, KM_USER0);
173         }
174
175         return kmap(buf->page);
176 }
177
178 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
179                             struct pipe_buffer *buf, void *map_data)
180 {
181         if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
182                 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
183                 kunmap_atomic(map_data, KM_USER0);
184         } else
185                 kunmap(buf->page);
186 }
187
188 int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
189                            struct pipe_buffer *buf)
190 {
191         struct page *page = buf->page;
192
193         if (page_count(page) == 1) {
194                 lock_page(page);
195                 return 0;
196         }
197
198         return 1;
199 }
200
201 void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
202 {
203         page_cache_get(buf->page);
204 }
205
/*
 * Default ->confirm() implementation: performs no checks and always
 * reports success (0).
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
211
212 static const struct pipe_buf_operations anon_pipe_buf_ops = {
213         .can_merge = 1,
214         .map = generic_pipe_buf_map,
215         .unmap = generic_pipe_buf_unmap,
216         .confirm = generic_pipe_buf_confirm,
217         .release = anon_pipe_buf_release,
218         .steal = generic_pipe_buf_steal,
219         .get = generic_pipe_buf_get,
220 };
221
222 static ssize_t
223 pipe_read(struct kiocb *iocb, const struct iovec *_iov,
224            unsigned long nr_segs, loff_t pos)
225 {
226         struct file *filp = iocb->ki_filp;
227         struct inode *inode = filp->f_path.dentry->d_inode;
228         struct pipe_inode_info *pipe;
229         int do_wakeup;
230         ssize_t ret;
231         struct iovec *iov = (struct iovec *)_iov;
232         size_t total_len;
233
234         total_len = iov_length(iov, nr_segs);
235         /* Null read succeeds. */
236         if (unlikely(total_len == 0))
237                 return 0;
238
239         do_wakeup = 0;
240         ret = 0;
241         mutex_lock(&inode->i_mutex);
242         pipe = inode->i_pipe;
243         for (;;) {
244                 int bufs = pipe->nrbufs;
245                 if (bufs) {
246                         int curbuf = pipe->curbuf;
247                         struct pipe_buffer *buf = pipe->bufs + curbuf;
248                         const struct pipe_buf_operations *ops = buf->ops;
249                         void *addr;
250                         size_t chars = buf->len;
251                         int error, atomic;
252
253                         if (chars > total_len)
254                                 chars = total_len;
255
256                         error = ops->confirm(pipe, buf);
257                         if (error) {
258                                 if (!ret)
259                                         error = ret;
260                                 break;
261                         }
262
263                         atomic = !iov_fault_in_pages_write(iov, chars);
264 redo:
265                         addr = ops->map(pipe, buf, atomic);
266                         error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
267                         ops->unmap(pipe, buf, addr);
268                         if (unlikely(error)) {
269                                 /*
270                                  * Just retry with the slow path if we failed.
271                                  */
272                                 if (atomic) {
273                                         atomic = 0;
274                                         goto redo;
275                                 }
276                                 if (!ret)
277                                         ret = error;
278                                 break;
279                         }
280                         ret += chars;
281                         buf->offset += chars;
282                         buf->len -= chars;
283                         if (!buf->len) {
284                                 buf->ops = NULL;
285                                 ops->release(pipe, buf);
286                                 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
287                                 pipe->curbuf = curbuf;
288                                 pipe->nrbufs = --bufs;
289                                 do_wakeup = 1;
290                         }
291                         total_len -= chars;
292                         if (!total_len)
293                                 break;  /* common path: read succeeded */
294                 }
295                 if (bufs)       /* More to do? */
296                         continue;
297                 if (!pipe->writers)
298                         break;
299                 if (!pipe->waiting_writers) {
300                         /* syscall merging: Usually we must not sleep
301                          * if O_NONBLOCK is set, or if we got some data.
302                          * But if a writer sleeps in kernel space, then
303                          * we can wait for that data without violating POSIX.
304                          */
305                         if (ret)
306                                 break;
307                         if (filp->f_flags & O_NONBLOCK) {
308                                 ret = -EAGAIN;
309                                 break;
310                         }
311                 }
312                 if (signal_pending(current)) {
313                         if (!ret)
314                                 ret = -ERESTARTSYS;
315                         break;
316                 }
317                 if (do_wakeup) {
318                         wake_up_interruptible_sync(&pipe->wait);
319                         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
320                 }
321                 pipe_wait(pipe);
322         }
323         mutex_unlock(&inode->i_mutex);
324
325         /* Signal writers asynchronously that there is more room. */
326         if (do_wakeup) {
327                 wake_up_interruptible(&pipe->wait);
328                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
329         }
330         if (ret > 0)
331                 file_accessed(filp);
332         return ret;
333 }
334
335 static ssize_t
336 pipe_write(struct kiocb *iocb, const struct iovec *_iov,
337             unsigned long nr_segs, loff_t ppos)
338 {
339         struct file *filp = iocb->ki_filp;
340         struct inode *inode = filp->f_path.dentry->d_inode;
341         struct pipe_inode_info *pipe;
342         ssize_t ret;
343         int do_wakeup;
344         struct iovec *iov = (struct iovec *)_iov;
345         size_t total_len;
346         ssize_t chars;
347
348         total_len = iov_length(iov, nr_segs);
349         /* Null write succeeds. */
350         if (unlikely(total_len == 0))
351                 return 0;
352
353         do_wakeup = 0;
354         ret = 0;
355         mutex_lock(&inode->i_mutex);
356         pipe = inode->i_pipe;
357
358         if (!pipe->readers) {
359                 send_sig(SIGPIPE, current, 0);
360                 ret = -EPIPE;
361                 goto out;
362         }
363
364         /* We try to merge small writes */
365         chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
366         if (pipe->nrbufs && chars != 0) {
367                 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
368                                                         (PIPE_BUFFERS-1);
369                 struct pipe_buffer *buf = pipe->bufs + lastbuf;
370                 const struct pipe_buf_operations *ops = buf->ops;
371                 int offset = buf->offset + buf->len;
372
373                 if (ops->can_merge && offset + chars <= PAGE_SIZE) {
374                         int error, atomic = 1;
375                         void *addr;
376
377                         error = ops->confirm(pipe, buf);
378                         if (error)
379                                 goto out;
380
381                         iov_fault_in_pages_read(iov, chars);
382 redo1:
383                         addr = ops->map(pipe, buf, atomic);
384                         error = pipe_iov_copy_from_user(offset + addr, iov,
385                                                         chars, atomic);
386                         ops->unmap(pipe, buf, addr);
387                         ret = error;
388                         do_wakeup = 1;
389                         if (error) {
390                                 if (atomic) {
391                                         atomic = 0;
392                                         goto redo1;
393                                 }
394                                 goto out;
395                         }
396                         buf->len += chars;
397                         total_len -= chars;
398                         ret = chars;
399                         if (!total_len)
400                                 goto out;
401                 }
402         }
403
404         for (;;) {
405                 int bufs;
406
407                 if (!pipe->readers) {
408                         send_sig(SIGPIPE, current, 0);
409                         if (!ret)
410                                 ret = -EPIPE;
411                         break;
412                 }
413                 bufs = pipe->nrbufs;
414                 if (bufs < PIPE_BUFFERS) {
415                         int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
416                         struct pipe_buffer *buf = pipe->bufs + newbuf;
417                         struct page *page = pipe->tmp_page;
418                         char *src;
419                         int error, atomic = 1;
420
421                         if (!page) {
422                                 page = alloc_page(GFP_HIGHUSER);
423                                 if (unlikely(!page)) {
424                                         ret = ret ? : -ENOMEM;
425                                         break;
426                                 }
427                                 pipe->tmp_page = page;
428                         }
429                         /* Always wake up, even if the copy fails. Otherwise
430                          * we lock up (O_NONBLOCK-)readers that sleep due to
431                          * syscall merging.
432                          * FIXME! Is this really true?
433                          */
434                         do_wakeup = 1;
435                         chars = PAGE_SIZE;
436                         if (chars > total_len)
437                                 chars = total_len;
438
439                         iov_fault_in_pages_read(iov, chars);
440 redo2:
441                         if (atomic)
442                                 src = kmap_atomic(page, KM_USER0);
443                         else
444                                 src = kmap(page);
445
446                         error = pipe_iov_copy_from_user(src, iov, chars,
447                                                         atomic);
448                         if (atomic)
449                                 kunmap_atomic(src, KM_USER0);
450                         else
451                                 kunmap(page);
452
453                         if (unlikely(error)) {
454                                 if (atomic) {
455                                         atomic = 0;
456                                         goto redo2;
457                                 }
458                                 if (!ret)
459                                         ret = error;
460                                 break;
461                         }
462                         ret += chars;
463
464                         /* Insert it into the buffer array */
465                         buf->page = page;
466                         buf->ops = &anon_pipe_buf_ops;
467                         buf->offset = 0;
468                         buf->len = chars;
469                         pipe->nrbufs = ++bufs;
470                         pipe->tmp_page = NULL;
471
472                         total_len -= chars;
473                         if (!total_len)
474                                 break;
475                 }
476                 if (bufs < PIPE_BUFFERS)
477                         continue;
478                 if (filp->f_flags & O_NONBLOCK) {
479                         if (!ret)
480                                 ret = -EAGAIN;
481                         break;
482                 }
483                 if (signal_pending(current)) {
484                         if (!ret)
485                                 ret = -ERESTARTSYS;
486                         break;
487                 }
488                 if (do_wakeup) {
489                         wake_up_interruptible_sync(&pipe->wait);
490                         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
491                         do_wakeup = 0;
492                 }
493                 pipe->waiting_writers++;
494                 pipe_wait(pipe);
495                 pipe->waiting_writers--;
496         }
497 out:
498         mutex_unlock(&inode->i_mutex);
499         if (do_wakeup) {
500                 wake_up_interruptible(&pipe->wait);
501                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
502         }
503         if (ret > 0)
504                 file_update_time(filp);
505         return ret;
506 }
507
508 static ssize_t
509 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
510 {
511         return -EBADF;
512 }
513
514 static ssize_t
515 bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
516            loff_t *ppos)
517 {
518         return -EBADF;
519 }
520
521 static int
522 pipe_ioctl(struct inode *pino, struct file *filp,
523            unsigned int cmd, unsigned long arg)
524 {
525         struct inode *inode = filp->f_path.dentry->d_inode;
526         struct pipe_inode_info *pipe;
527         int count, buf, nrbufs;
528
529         switch (cmd) {
530                 case FIONREAD:
531                         mutex_lock(&inode->i_mutex);
532                         pipe = inode->i_pipe;
533                         count = 0;
534                         buf = pipe->curbuf;
535                         nrbufs = pipe->nrbufs;
536                         while (--nrbufs >= 0) {
537                                 count += pipe->bufs[buf].len;
538                                 buf = (buf+1) & (PIPE_BUFFERS-1);
539                         }
540                         mutex_unlock(&inode->i_mutex);
541
542                         return put_user(count, (int __user *)arg);
543                 default:
544                         return -EINVAL;
545         }
546 }
547
548 /* No kernel lock held - fine */
549 static unsigned int
550 pipe_poll(struct file *filp, poll_table *wait)
551 {
552         unsigned int mask;
553         struct inode *inode = filp->f_path.dentry->d_inode;
554         struct pipe_inode_info *pipe = inode->i_pipe;
555         int nrbufs;
556
557         poll_wait(filp, &pipe->wait, wait);
558
559         /* Reading only -- no need for acquiring the semaphore.  */
560         nrbufs = pipe->nrbufs;
561         mask = 0;
562         if (filp->f_mode & FMODE_READ) {
563                 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
564                 if (!pipe->writers && filp->f_version != pipe->w_counter)
565                         mask |= POLLHUP;
566         }
567
568         if (filp->f_mode & FMODE_WRITE) {
569                 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
570                 /*
571                  * Most Unices do not set POLLERR for FIFOs but on Linux they
572                  * behave exactly like pipes for poll().
573                  */
574                 if (!pipe->readers)
575                         mask |= POLLERR;
576         }
577
578         return mask;
579 }
580
581 static int
582 pipe_release(struct inode *inode, int decr, int decw)
583 {
584         struct pipe_inode_info *pipe;
585
586         mutex_lock(&inode->i_mutex);
587         pipe = inode->i_pipe;
588         pipe->readers -= decr;
589         pipe->writers -= decw;
590
591         if (!pipe->readers && !pipe->writers) {
592                 free_pipe_info(inode);
593         } else {
594                 wake_up_interruptible(&pipe->wait);
595                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
596                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
597         }
598         mutex_unlock(&inode->i_mutex);
599
600         return 0;
601 }
602
603 static int
604 pipe_read_fasync(int fd, struct file *filp, int on)
605 {
606         struct inode *inode = filp->f_path.dentry->d_inode;
607         int retval;
608
609         mutex_lock(&inode->i_mutex);
610         retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
611         mutex_unlock(&inode->i_mutex);
612
613         if (retval < 0)
614                 return retval;
615
616         return 0;
617 }
618
619
620 static int
621 pipe_write_fasync(int fd, struct file *filp, int on)
622 {
623         struct inode *inode = filp->f_path.dentry->d_inode;
624         int retval;
625
626         mutex_lock(&inode->i_mutex);
627         retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
628         mutex_unlock(&inode->i_mutex);
629
630         if (retval < 0)
631                 return retval;
632
633         return 0;
634 }
635
636
637 static int
638 pipe_rdwr_fasync(int fd, struct file *filp, int on)
639 {
640         struct inode *inode = filp->f_path.dentry->d_inode;
641         struct pipe_inode_info *pipe = inode->i_pipe;
642         int retval;
643
644         mutex_lock(&inode->i_mutex);
645
646         retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
647
648         if (retval >= 0)
649                 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
650
651         mutex_unlock(&inode->i_mutex);
652
653         if (retval < 0)
654                 return retval;
655
656         return 0;
657 }
658
659
/*
 * ->release() for read ends: remove the file from the readers' fasync
 * list, then drop one reader from the pipe.
 */
static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	const int readers_gone = 1, writers_gone = 0;

	pipe_read_fasync(-1, filp, 0);
	return pipe_release(inode, readers_gone, writers_gone);
}
666
/*
 * ->release() for write ends: remove the file from the writers' fasync
 * list, then drop one writer from the pipe.
 */
static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	const int readers_gone = 0, writers_gone = 1;

	pipe_write_fasync(-1, filp, 0);
	return pipe_release(inode, readers_gone, writers_gone);
}
673
674 static int
675 pipe_rdwr_release(struct inode *inode, struct file *filp)
676 {
677         int decr, decw;
678
679         pipe_rdwr_fasync(-1, filp, 0);
680         decr = (filp->f_mode & FMODE_READ) != 0;
681         decw = (filp->f_mode & FMODE_WRITE) != 0;
682         return pipe_release(inode, decr, decw);
683 }
684
685 static int
686 pipe_read_open(struct inode *inode, struct file *filp)
687 {
688         /* We could have perhaps used atomic_t, but this and friends
689            below are the only places.  So it doesn't seem worthwhile.  */
690         mutex_lock(&inode->i_mutex);
691         inode->i_pipe->readers++;
692         mutex_unlock(&inode->i_mutex);
693
694         return 0;
695 }
696
697 static int
698 pipe_write_open(struct inode *inode, struct file *filp)
699 {
700         mutex_lock(&inode->i_mutex);
701         inode->i_pipe->writers++;
702         mutex_unlock(&inode->i_mutex);
703
704         return 0;
705 }
706
707 static int
708 pipe_rdwr_open(struct inode *inode, struct file *filp)
709 {
710         mutex_lock(&inode->i_mutex);
711         if (filp->f_mode & FMODE_READ)
712                 inode->i_pipe->readers++;
713         if (filp->f_mode & FMODE_WRITE)
714                 inode->i_pipe->writers++;
715         mutex_unlock(&inode->i_mutex);
716
717         return 0;
718 }
719
720 /*
721  * The file_operations structs are not static because they
722  * are also used in linux/fs/fifo.c to do operations on FIFOs.
723  */
724 const struct file_operations read_fifo_fops = {
725         .llseek         = no_llseek,
726         .read           = do_sync_read,
727         .aio_read       = pipe_read,
728         .write          = bad_pipe_w,
729         .poll           = pipe_poll,
730         .ioctl          = pipe_ioctl,
731         .open           = pipe_read_open,
732         .release        = pipe_read_release,
733         .fasync         = pipe_read_fasync,
734 };
735
736 const struct file_operations write_fifo_fops = {
737         .llseek         = no_llseek,
738         .read           = bad_pipe_r,
739         .write          = do_sync_write,
740         .aio_write      = pipe_write,
741         .poll           = pipe_poll,
742         .ioctl          = pipe_ioctl,
743         .open           = pipe_write_open,
744         .release        = pipe_write_release,
745         .fasync         = pipe_write_fasync,
746 };
747
748 const struct file_operations rdwr_fifo_fops = {
749         .llseek         = no_llseek,
750         .read           = do_sync_read,
751         .aio_read       = pipe_read,
752         .write          = do_sync_write,
753         .aio_write      = pipe_write,
754         .poll           = pipe_poll,
755         .ioctl          = pipe_ioctl,
756         .open           = pipe_rdwr_open,
757         .release        = pipe_rdwr_release,
758         .fasync         = pipe_rdwr_fasync,
759 };
760
761 static const struct file_operations read_pipe_fops = {
762         .llseek         = no_llseek,
763         .read           = do_sync_read,
764         .aio_read       = pipe_read,
765         .write          = bad_pipe_w,
766         .poll           = pipe_poll,
767         .ioctl          = pipe_ioctl,
768         .open           = pipe_read_open,
769         .release        = pipe_read_release,
770         .fasync         = pipe_read_fasync,
771 };
772
773 static const struct file_operations write_pipe_fops = {
774         .llseek         = no_llseek,
775         .read           = bad_pipe_r,
776         .write          = do_sync_write,
777         .aio_write      = pipe_write,
778         .poll           = pipe_poll,
779         .ioctl          = pipe_ioctl,
780         .open           = pipe_write_open,
781         .release        = pipe_write_release,
782         .fasync         = pipe_write_fasync,
783 };
784
785 static const struct file_operations rdwr_pipe_fops = {
786         .llseek         = no_llseek,
787         .read           = do_sync_read,
788         .aio_read       = pipe_read,
789         .write          = do_sync_write,
790         .aio_write      = pipe_write,
791         .poll           = pipe_poll,
792         .ioctl          = pipe_ioctl,
793         .open           = pipe_rdwr_open,
794         .release        = pipe_rdwr_release,
795         .fasync         = pipe_rdwr_fasync,
796 };
797
798 struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
799 {
800         struct pipe_inode_info *pipe;
801
802         pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
803         if (pipe) {
804                 init_waitqueue_head(&pipe->wait);
805                 pipe->r_counter = pipe->w_counter = 1;
806                 pipe->inode = inode;
807         }
808
809         return pipe;
810 }
811
812 void __free_pipe_info(struct pipe_inode_info *pipe)
813 {
814         int i;
815
816         for (i = 0; i < PIPE_BUFFERS; i++) {
817                 struct pipe_buffer *buf = pipe->bufs + i;
818                 if (buf->ops)
819                         buf->ops->release(pipe, buf);
820         }
821         if (pipe->tmp_page)
822                 __free_page(pipe->tmp_page);
823         kfree(pipe);
824 }
825
826 void free_pipe_info(struct inode *inode)
827 {
828         __free_pipe_info(inode->i_pipe);
829         inode->i_pipe = NULL;
830 }
831
832 static struct vfsmount *pipe_mnt __read_mostly;
833 static int pipefs_delete_dentry(struct dentry *dentry)
834 {
835         /*
836          * At creation time, we pretended this dentry was hashed
837          * (by clearing DCACHE_UNHASHED bit in d_flags)
838          * At delete time, we restore the truth : not hashed.
839          * (so that dput() can proceed correctly)
840          */
841         dentry->d_flags |= DCACHE_UNHASHED;
842         return 0;
843 }
844
845 /*
846  * pipefs_dname() is called from d_path().
847  */
848 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
849 {
850         return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
851                                 dentry->d_inode->i_ino);
852 }
853
854 static struct dentry_operations pipefs_dentry_operations = {
855         .d_delete       = pipefs_delete_dentry,
856         .d_dname        = pipefs_dname,
857 };
858
859 static struct inode * get_pipe_inode(void)
860 {
861         struct inode *inode = new_inode(pipe_mnt->mnt_sb);
862         struct pipe_inode_info *pipe;
863
864         if (!inode)
865                 goto fail_inode;
866
867         pipe = alloc_pipe_info(inode);
868         if (!pipe)
869                 goto fail_iput;
870         inode->i_pipe = pipe;
871
872         pipe->readers = pipe->writers = 1;
873         inode->i_fop = &rdwr_pipe_fops;
874
875         /*
876          * Mark the inode dirty from the very beginning,
877          * that way it will never be moved to the dirty
878          * list because "mark_inode_dirty()" will think
879          * that it already _is_ on the dirty list.
880          */
881         inode->i_state = I_DIRTY;
882         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
883         inode->i_uid = current->fsuid;
884         inode->i_gid = current->fsgid;
885         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
886
887         return inode;
888
889 fail_iput:
890         iput(inode);
891
892 fail_inode:
893         return NULL;
894 }
895
896 struct file *create_write_pipe(void)
897 {
898         int err;
899         struct inode *inode;
900         struct file *f;
901         struct dentry *dentry;
902         struct qstr name = { .name = "" };
903
904         f = get_empty_filp();
905         if (!f)
906                 return ERR_PTR(-ENFILE);
907         err = -ENFILE;
908         inode = get_pipe_inode();
909         if (!inode)
910                 goto err_file;
911
912         err = -ENOMEM;
913         dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
914         if (!dentry)
915                 goto err_inode;
916
917         dentry->d_op = &pipefs_dentry_operations;
918         /*
919          * We dont want to publish this dentry into global dentry hash table.
920          * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
921          * This permits a working /proc/$pid/fd/XXX on pipes
922          */
923         dentry->d_flags &= ~DCACHE_UNHASHED;
924         d_instantiate(dentry, inode);
925         f->f_path.mnt = mntget(pipe_mnt);
926         f->f_path.dentry = dentry;
927         f->f_mapping = inode->i_mapping;
928
929         f->f_flags = O_WRONLY;
930         f->f_op = &write_pipe_fops;
931         f->f_mode = FMODE_WRITE;
932         f->f_version = 0;
933
934         return f;
935
936  err_inode:
937         free_pipe_info(inode);
938         iput(inode);
939  err_file:
940         put_filp(f);
941         return ERR_PTR(err);
942 }
943
944 void free_write_pipe(struct file *f)
945 {
946         free_pipe_info(f->f_dentry->d_inode);
947         dput(f->f_path.dentry);
948         mntput(f->f_path.mnt);
949         put_filp(f);
950 }
951
952 struct file *create_read_pipe(struct file *wrf)
953 {
954         struct file *f = get_empty_filp();
955         if (!f)
956                 return ERR_PTR(-ENFILE);
957
958         /* Grab pipe from the writer */
959         f->f_path.mnt = mntget(wrf->f_path.mnt);
960         f->f_path.dentry = dget(wrf->f_path.dentry);
961         f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
962
963         f->f_pos = 0;
964         f->f_flags = O_RDONLY;
965         f->f_op = &read_pipe_fops;
966         f->f_mode = FMODE_READ;
967         f->f_version = 0;
968
969         return f;
970 }
971
972 int do_pipe(int *fd)
973 {
974         struct file *fw, *fr;
975         int error;
976         int fdw, fdr;
977
978         fw = create_write_pipe();
979         if (IS_ERR(fw))
980                 return PTR_ERR(fw);
981         fr = create_read_pipe(fw);
982         error = PTR_ERR(fr);
983         if (IS_ERR(fr))
984                 goto err_write_pipe;
985
986         error = get_unused_fd();
987         if (error < 0)
988                 goto err_read_pipe;
989         fdr = error;
990
991         error = get_unused_fd();
992         if (error < 0)
993                 goto err_fdr;
994         fdw = error;
995
996         error = audit_fd_pair(fdr, fdw);
997         if (error < 0)
998                 goto err_fdw;
999
1000         fd_install(fdr, fr);
1001         fd_install(fdw, fw);
1002         fd[0] = fdr;
1003         fd[1] = fdw;
1004
1005         return 0;
1006
1007  err_fdw:
1008         put_unused_fd(fdw);
1009  err_fdr:
1010         put_unused_fd(fdr);
1011  err_read_pipe:
1012         dput(fr->f_dentry);
1013         mntput(fr->f_vfsmnt);
1014         put_filp(fr);
1015  err_write_pipe:
1016         free_write_pipe(fw);
1017         return error;
1018 }
1019
1020 /*
1021  * pipefs should _never_ be mounted by userland - too much of security hassle,
1022  * no real gain from having the whole whorehouse mounted. So we don't need
1023  * any operations on the root directory. However, we need a non-trivial
1024  * d_name - pipe: will go nicely and kill the special-casing in procfs.
1025  */
1026 static int pipefs_get_sb(struct file_system_type *fs_type,
1027                          int flags, const char *dev_name, void *data,
1028                          struct vfsmount *mnt)
1029 {
1030         return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
1031 }
1032
1033 static struct file_system_type pipe_fs_type = {
1034         .name           = "pipefs",
1035         .get_sb         = pipefs_get_sb,
1036         .kill_sb        = kill_anon_super,
1037 };
1038
1039 static int __init init_pipe_fs(void)
1040 {
1041         int err = register_filesystem(&pipe_fs_type);
1042
1043         if (!err) {
1044                 pipe_mnt = kern_mount(&pipe_fs_type);
1045                 if (IS_ERR(pipe_mnt)) {
1046                         err = PTR_ERR(pipe_mnt);
1047                         unregister_filesystem(&pipe_fs_type);
1048                 }
1049         }
1050         return err;
1051 }
1052
1053 static void __exit exit_pipe_fs(void)
1054 {
1055         unregister_filesystem(&pipe_fs_type);
1056         mntput(pipe_mnt);
1057 }
1058
/* Hook pipefs setup in as an fs initcall and its teardown as module exit. */
fs_initcall(init_pipe_fs);
module_exit(exit_pipe_fs);