net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
11  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
12  *              Florian La Roche, <flla@stud.uni-sb.de>
13  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
15  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
16  *              Matthew Dillon, <dillon@apollo.west.oic.com>
17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18  *              Jorge Cwik, <jorge@laser.satlink.net>
19  *
20  * Fixes:
21  *              Alan Cox        :       Numerous verify_area() calls
22  *              Alan Cox        :       Set the ACK bit on a reset
23  *              Alan Cox        :       Stopped it crashing if it closed while
24  *                                      sk->inuse=1 and was trying to connect
25  *                                      (tcp_err()).
26  *              Alan Cox        :       All icmp error handling was broken;
27  *                                      pointers passed were wrong and the
28  *                                      socket was looked up backwards. Nobody
29  *                                      tested any icmp error code obviously.
30  *              Alan Cox        :       tcp_err() now handled properly. It
31  *                                      wakes people on errors. poll
32  *                                      behaves and the icmp error race
33  *                                      has gone by moving it into sock.c
34  *              Alan Cox        :       tcp_send_reset() fixed to work for
35  *                                      everything not just packets for
36  *                                      unknown sockets.
37  *              Alan Cox        :       tcp option processing.
38  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
39  *                                      syn rule wrong]
40  *              Herp Rosmanith  :       More reset fixes
41  *              Alan Cox        :       No longer acks invalid rst frames.
42  *                                      Acking any kind of RST is right out.
43  *              Alan Cox        :       Sets an ignore me flag on an rst
44  *                                      receive otherwise odd bits of prattle
45  *                                      escape still
46  *              Alan Cox        :       Fixed another acking RST frame bug.
47  *                                      Should stop LAN workplace lockups.
48  *              Alan Cox        :       Some tidyups using the new skb list
49  *                                      facilities
50  *              Alan Cox        :       sk->keepopen now seems to work
51  *              Alan Cox        :       Pulls options out correctly on accepts
52  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
53  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
54  *                                      bit to skb ops.
55  *              Alan Cox        :       Tidied tcp_data to avoid a potential
56  *                                      nasty.
57  *              Alan Cox        :       Added some better commenting, as the
58  *                                      tcp is hard to follow
59  *              Alan Cox        :       Removed incorrect check for 20 * psh
60  *      Michael O'Reilly        :       ack < copied bug fix.
61  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
62  *              Alan Cox        :       FIN with no memory -> CRASH
63  *              Alan Cox        :       Added socket option proto entries.
64  *                                      Also added awareness of them to accept.
65  *              Alan Cox        :       Added TCP options (SOL_TCP)
66  *              Alan Cox        :       Switched wakeup calls to callbacks,
67  *                                      so the kernel can layer network
68  *                                      sockets.
69  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
70  *              Alan Cox        :       Handle FIN (more) properly (we hope).
71  *              Alan Cox        :       RST frames sent on unsynchronised
72  *                                      state ack error.
73  *              Alan Cox        :       Put in missing check for SYN bit.
74  *              Alan Cox        :       Added tcp_select_window() aka NET2E
75  *                                      window non shrink trick.
76  *              Alan Cox        :       Added a couple of small NET2E timer
77  *                                      fixes
78  *              Charles Hedrick :       TCP fixes
79  *              Toomas Tamm     :       TCP window fixes
80  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
81  *              Charles Hedrick :       Rewrote most of it to actually work
82  *              Linus           :       Rewrote tcp_read() and URG handling
83  *                                      completely
84  *              Gerhard Koerting:       Fixed some missing timer handling
85  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
86  *              Gerhard Koerting:       PC/TCP workarounds
87  *              Adam Caldwell   :       Assorted timer/timing errors
88  *              Matthew Dillon  :       Fixed another RST bug
89  *              Alan Cox        :       Move to kernel side addressing changes.
90  *              Alan Cox        :       Beginning work on TCP fastpathing
91  *                                      (not yet usable)
92  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
93  *              Alan Cox        :       TCP fast path debugging
94  *              Alan Cox        :       Window clamping
95  *              Michael Riepe   :       Bug in tcp_check()
96  *              Matt Dillon     :       More TCP improvements and RST bug fixes
97  *              Matt Dillon     :       Yet more small nasties removed from the
98  *                                      TCP code (Be very nice to this man if
99  *                                      tcp finally works 100%) 8)
100  *              Alan Cox        :       BSD accept semantics.
101  *              Alan Cox        :       Reset on closedown bug.
102  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
103  *              Michael Pall    :       Handle poll() after URG properly in
104  *                                      all cases.
105  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
106  *                                      (multi URG PUSH broke rlogin).
107  *              Michael Pall    :       Fix the multi URG PUSH problem in
108  *                                      tcp_readable(), poll() after URG
109  *                                      works now.
110  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
111  *                                      BSD api.
112  *              Alan Cox        :       Changed the semantics of sk->socket to
113  *                                      fix a race and a signal problem with
114  *                                      accept() and async I/O.
115  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
116  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
117  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
118  *                                      clients/servers which listen in on
119  *                                      fixed ports.
120  *              Alan Cox        :       Cleaned the above up and shrank it to
121  *                                      a sensible code size.
122  *              Alan Cox        :       Self connect lockup fix.
123  *              Alan Cox        :       No connect to multicast.
124  *              Ross Biro       :       Close unaccepted children on master
125  *                                      socket close.
126  *              Alan Cox        :       Reset tracing code.
127  *              Alan Cox        :       Spurious resets on shutdown.
128  *              Alan Cox        :       Giant 15 minute/60 second timer error
129  *              Alan Cox        :       Small whoops in polling before an
130  *                                      accept.
131  *              Alan Cox        :       Kept the state trace facility since
132  *                                      it's handy for debugging.
133  *              Alan Cox        :       More reset handler fixes.
134  *              Alan Cox        :       Started rewriting the code based on
135  *                                      the RFC's for other useful protocol
136  *                                      references see: Comer, KA9Q NOS, and
137  *                                      for a reference on the difference
138  *                                      between specifications and how BSD
139  *                                      works see the 4.4lite source.
140  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
141  *                                      close.
142  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
143  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
144  *              Alan Cox        :       Reimplemented timers as per the RFC
145  *                                      and using multiple timers for sanity.
146  *              Alan Cox        :       Small bug fixes, and a lot of new
147  *                                      comments.
148  *              Alan Cox        :       Fixed dual reader crash by locking
149  *                                      the buffers (much like datagram.c)
150  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
151  *                                      now gets fed up of retrying without
152  *                                      (even a no space) answer.
153  *              Alan Cox        :       Extracted closing code better
154  *              Alan Cox        :       Fixed the closing state machine to
155  *                                      resemble the RFC.
156  *              Alan Cox        :       More 'per spec' fixes.
157  *              Jorge Cwik      :       Even faster checksumming.
158  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
159  *                                      only frames. At least one pc tcp stack
160  *                                      generates them.
161  *              Alan Cox        :       Cache last socket.
162  *              Alan Cox        :       Per route irtt.
163  *              Matt Day        :       poll()->select() match BSD precisely on error
164  *              Alan Cox        :       New buffers
165  *              Marc Tamsky     :       Various sk->prot->retransmits and
166  *                                      sk->retransmits misupdating fixed.
167  *                                      Fixed tcp_write_timeout: stuck close,
168  *                                      and TCP syn retries gets used now.
169  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
170  *                                      ack if state is TCP_CLOSED.
171  *              Alan Cox        :       Look up device on a retransmit - routes may
172  *                                      change. Doesn't yet cope with MSS shrink right
173  *                                      but it's a start!
174  *              Marc Tamsky     :       Closing in closing fixes.
175  *              Mike Shaver     :       RFC1122 verifications.
176  *              Alan Cox        :       rcv_saddr errors.
177  *              Alan Cox        :       Block double connect().
178  *              Alan Cox        :       Small hooks for enSKIP.
179  *              Alexey Kuznetsov:       Path MTU discovery.
180  *              Alan Cox        :       Support soft errors.
181  *              Alan Cox        :       Fix MTU discovery pathological case
182  *                                      when the remote claims no mtu!
183  *              Marc Tamsky     :       TCP_CLOSE fix.
184  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
185  *                                      window but wrong (fixes NT lpd problems)
186  *              Pedro Roque     :       Better TCP window handling, delayed ack.
187  *              Joerg Reuter    :       No modification of locked buffers in
188  *                                      tcp_do_retransmit()
189  *              Eric Schenk     :       Changed receiver side silly window
190  *                                      avoidance algorithm to BSD style
191  *                                      algorithm. This doubles throughput
192  *                                      against machines running Solaris,
193  *                                      and seems to result in general
194  *                                      improvement.
195  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
196  *      Willy Konynenberg       :       Transparent proxying support.
197  *      Mike McLagan            :       Routing by source
198  *              Keith Owens     :       Do proper merging with partial SKB's in
199  *                                      tcp_do_sendmsg to avoid burstiness.
200  *              Eric Schenk     :       Fix fast close down bug with
201  *                                      shutdown() followed by close().
202  *              Andi Kleen      :       Make poll agree with SIGIO
203  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
204  *                                      lingertime == 0 (RFC 793 ABORT Call)
205  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
206  *                                      csum_and_copy_from_user() if possible.
207  *
208  *              This program is free software; you can redistribute it and/or
209  *              modify it under the terms of the GNU General Public License
210  *              as published by the Free Software Foundation; either version
211  *              2 of the License, or (at your option) any later version.
212  *
213  * Description of States:
214  *
215  *      TCP_SYN_SENT            sent a connection request, waiting for ack
216  *
217  *      TCP_SYN_RECV            received a connection request, sent ack,
218  *                              waiting for final ack in three-way handshake.
219  *
220  *      TCP_ESTABLISHED         connection established
221  *
222  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
223  *                              transmission of remaining buffered data
224  *
225  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
226  *                              to shutdown
227  *
228  *      TCP_CLOSING             both sides have shutdown but we still have
229  *                              data we have to finish sending
230  *
231  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
232  *                              closed, can only be entered from FIN_WAIT2
233  *                              or CLOSING.  Required because the other end
234  *                              may not have gotten our last ACK causing it
235  *                              to retransmit the data packet (which we ignore)
236  *
237  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
238  *                              us to finish writing our data and to shutdown
239  *                              (we have to close() to move on to LAST_ACK)
240  *
241  *      TCP_LAST_ACK            our side has shutdown after remote has
242  *                              shutdown.  There may still be data in our
243  *                              buffer that we have to finish sending
244  *
245  *      TCP_CLOSE               socket is finished
246  */
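/*
 * Typical traversals of the states above (a rough sketch, per RFC 793):
 *
 *      active open:         CLOSE -> SYN_SENT -> ESTABLISHED
 *      passive open:        LISTEN -> SYN_RECV -> ESTABLISHED
 *      active close:        ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 ->
 *                           TIME_WAIT -> CLOSE
 *      passive close:       ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *      simultaneous close:  FIN_WAIT1 -> CLOSING -> TIME_WAIT -> CLOSE
 */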
247
248 #include <linux/kernel.h>
249 #include <linux/module.h>
250 #include <linux/types.h>
251 #include <linux/fcntl.h>
252 #include <linux/poll.h>
253 #include <linux/init.h>
254 #include <linux/fs.h>
255 #include <linux/skbuff.h>
256 #include <linux/splice.h>
257 #include <linux/net.h>
258 #include <linux/socket.h>
259 #include <linux/random.h>
260 #include <linux/bootmem.h>
261 #include <linux/cache.h>
262 #include <linux/err.h>
263 #include <linux/crypto.h>
264
265 #include <net/icmp.h>
266 #include <net/tcp.h>
267 #include <net/xfrm.h>
268 #include <net/ip.h>
269 #include <net/netdma.h>
270 #include <net/sock.h>
271
272 #include <asm/uaccess.h>
273 #include <asm/ioctls.h>
274
275 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
276
277 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
278
279 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
280
281 EXPORT_SYMBOL_GPL(tcp_orphan_count);
282
283 int sysctl_tcp_mem[3] __read_mostly;
284 int sysctl_tcp_wmem[3] __read_mostly;
285 int sysctl_tcp_rmem[3] __read_mostly;
286
287 EXPORT_SYMBOL(sysctl_tcp_mem);
288 EXPORT_SYMBOL(sysctl_tcp_rmem);
289 EXPORT_SYMBOL(sysctl_tcp_wmem);
290
291 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
292 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
293
294 EXPORT_SYMBOL(tcp_memory_allocated);
295 EXPORT_SYMBOL(tcp_sockets_allocated);
296
297 /*
298  * TCP splice context
299  */
300 struct tcp_splice_state {
301         struct pipe_inode_info *pipe;
302         size_t len;
303         unsigned int flags;
304 };
305
306 /*
307  * Pressure flag: try to collapse.
308  * Technical note: it is used by multiple contexts non atomically.
309  * All the __sk_mem_schedule() is of this nature: accounting
310  * is strict, actions are advisory and have some latency.
311  */
312 int tcp_memory_pressure __read_mostly;
313
314 EXPORT_SYMBOL(tcp_memory_pressure);
315
316 void tcp_enter_memory_pressure(void)
317 {
318         if (!tcp_memory_pressure) {
319                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
320                 tcp_memory_pressure = 1;
321         }
322 }
323
324 EXPORT_SYMBOL(tcp_enter_memory_pressure);
325
326 /*
327  *      Wait for a TCP event.
328  *
329  *      Note that we don't need to lock the socket, as the upper poll layers
330  *      take care of normal races (between the test and the event) and we don't
331  *      go look at any of the socket buffers directly.
332  */
333 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
334 {
335         unsigned int mask;
336         struct sock *sk = sock->sk;
337         struct tcp_sock *tp = tcp_sk(sk);
338
339         poll_wait(file, sk->sk_sleep, wait);
340         if (sk->sk_state == TCP_LISTEN)
341                 return inet_csk_listen_poll(sk);
342
343         /* The socket is not locked. We are protected from async events
344            by the poll logic, and correct handling of state changes
345            made by other threads is impossible in any case.
346          */
347
348         mask = 0;
349         if (sk->sk_err)
350                 mask = POLLERR;
351
352         /*
353          * POLLHUP is certainly not done right. But poll() doesn't
354          * have a notion of HUP in just one direction, and for a
355          * socket the read side is more interesting.
356          *
357          * Some poll() documentation says that POLLHUP is incompatible
358          * with the POLLOUT/POLLWR flags, so somebody should check all
359          * this. But be careful: it tends to be safer to return too many
360          * bits than too few, and you can easily break real applications
361          * if you don't tell them that something has hung up!
362          *
363          * Check-me.
364          *
365          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
366          * our fs/select.c). It means that after we received EOF,
367          * poll always returns immediately, making poll() on write()
368          * impossible in state CLOSE_WAIT. One solution is evident --- to set
369          * POLLHUP if and only if shutdown has been made in both directions.
370          * Actually, it is interesting to look at how Solaris and DUX
371          * solve this dilemma. I would prefer, if POLLHUP were maskable,
372          * that we could set it on SND_SHUTDOWN. BTW the examples given
373          * in Stevens' books assume exactly this behaviour, which explains
374          * why POLLHUP is incompatible with POLLOUT.    --ANK
375          *
376          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
377          * blocking on fresh not-connected or disconnected socket. --ANK
378          */
379         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
380                 mask |= POLLHUP;
381         if (sk->sk_shutdown & RCV_SHUTDOWN)
382                 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
383
384         /* Connected? */
385         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
386                 /* Potential race condition. If the read of tp below is
387                  * reordered above the sk->sk_state check, we can be
388                  * illegally awakened in SYN_* states. */
389                 if ((tp->rcv_nxt != tp->copied_seq) &&
390                     (tp->urg_seq != tp->copied_seq ||
391                      tp->rcv_nxt != tp->copied_seq + 1 ||
392                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
393                         mask |= POLLIN | POLLRDNORM;
394
395                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
396                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
397                                 mask |= POLLOUT | POLLWRNORM;
398                         } else {  /* send SIGIO later */
399                                 set_bit(SOCK_ASYNC_NOSPACE,
400                                         &sk->sk_socket->flags);
401                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
402
403                                 /* Race breaker. If space is freed after
404                                  * wspace test but before the flags are set,
405                                  * IO signal will be lost.
406                                  */
407                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
408                                         mask |= POLLOUT | POLLWRNORM;
409                         }
410                 }
411
412                 if (tp->urg_data & TCP_URG_VALID)
413                         mask |= POLLPRI;
414         }
415         return mask;
416 }
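/*
 * A minimal userspace sketch of what ends up in tcp_poll() above
 * (illustrative only; 'tcp_fd' and the timeout are placeholders):
 *
 *	struct pollfd pfd = { .fd = tcp_fd, .events = POLLIN | POLLOUT };
 *
 *	if (poll(&pfd, 1, 1000) > 0) {
 *		if (pfd.revents & (POLLERR | POLLHUP))
 *			;	// error, or both directions shut down
 *		if (pfd.revents & POLLIN)
 *			;	// data (or a FIN) is readable
 *		if (pfd.revents & POLLOUT)
 *			;	// enough send buffer space to write
 *	}
 */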
417
418 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
419 {
420         struct tcp_sock *tp = tcp_sk(sk);
421         int answ;
422
423         switch (cmd) {
424         case SIOCINQ:
425                 if (sk->sk_state == TCP_LISTEN)
426                         return -EINVAL;
427
428                 lock_sock(sk);
429                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
430                         answ = 0;
431                 else if (sock_flag(sk, SOCK_URGINLINE) ||
432                          !tp->urg_data ||
433                          before(tp->urg_seq, tp->copied_seq) ||
434                          !before(tp->urg_seq, tp->rcv_nxt)) {
435                         answ = tp->rcv_nxt - tp->copied_seq;
436
437                         /* Subtract 1, if FIN is in queue. */
438                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
439                                 answ -=
440                        tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
441                 } else
442                         answ = tp->urg_seq - tp->copied_seq;
443                 release_sock(sk);
444                 break;
445         case SIOCATMARK:
446                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
447                 break;
448         case SIOCOUTQ:
449                 if (sk->sk_state == TCP_LISTEN)
450                         return -EINVAL;
451
452                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
453                         answ = 0;
454                 else
455                         answ = tp->write_seq - tp->snd_una;
456                 break;
457         default:
458                 return -ENOIOCTLCMD;
459         }
460
461         return put_user(answ, (int __user *)arg);
462 }
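/*
 * Userspace view of the ioctls handled above (illustrative only; 'tcp_fd'
 * is a placeholder for a connected TCP socket):
 *
 *	int unread = 0, unsent = 0;
 *
 *	ioctl(tcp_fd, SIOCINQ, &unread);   // bytes queued but not yet read
 *	ioctl(tcp_fd, SIOCOUTQ, &unsent);  // bytes written but not yet acked
 *	// SIOCATMARK reports whether the read pointer is at the urgent mark
 */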
463
464 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
465 {
466         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
467         tp->pushed_seq = tp->write_seq;
468 }
469
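/*
 * forced_push() below is the "don't sit on data forever" predicate: it
 * becomes true once more than half of the largest window the peer has
 * ever advertised has been written since the last marked PSH, so callers
 * push at least that often even under a steady stream of small writes.
 */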
470 static inline int forced_push(struct tcp_sock *tp)
471 {
472         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
473 }
474
475 static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
476 {
477         struct tcp_sock *tp = tcp_sk(sk);
478         struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
479
480         skb->csum    = 0;
481         tcb->seq     = tcb->end_seq = tp->write_seq;
482         tcb->flags   = TCPCB_FLAG_ACK;
483         tcb->sacked  = 0;
484         skb_header_release(skb);
485         tcp_add_write_queue_tail(sk, skb);
486         sk->sk_wmem_queued += skb->truesize;
487         sk_mem_charge(sk, skb->truesize);
488         if (tp->nonagle & TCP_NAGLE_PUSH)
489                 tp->nonagle &= ~TCP_NAGLE_PUSH;
490 }
491
492 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
493                                 struct sk_buff *skb)
494 {
495         if (flags & MSG_OOB) {
496                 tp->urg_mode = 1;
497                 tp->snd_up = tp->write_seq;
498         }
499 }
500
501 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
502                             int nonagle)
503 {
504         struct tcp_sock *tp = tcp_sk(sk);
505
506         if (tcp_send_head(sk)) {
507                 struct sk_buff *skb = tcp_write_queue_tail(sk);
508                 if (!(flags & MSG_MORE) || forced_push(tp))
509                         tcp_mark_push(tp, skb);
510                 tcp_mark_urg(tp, flags, skb);
511                 __tcp_push_pending_frames(sk, mss_now,
512                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
513         }
514 }
515
516 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
517                                 unsigned int offset, size_t len)
518 {
519         struct tcp_splice_state *tss = rd_desc->arg.data;
520
521         return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
522 }
523
524 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
525 {
526         /* Store TCP splice context information in read_descriptor_t. */
527         read_descriptor_t rd_desc = {
528                 .arg.data = tss,
529         };
530
531         return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
532 }
533
534 /**
535  *  tcp_splice_read - splice data from TCP socket to a pipe
536  * @sock:       socket to splice from
537  * @ppos:       position (not valid)
538  * @pipe:       pipe to splice to
539  * @len:        number of bytes to splice
540  * @flags:      splice modifier flags
541  *
542  * Description:
543  *    Will read pages from given socket and fill them into a pipe.
544  *
545  **/
546 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
547                         struct pipe_inode_info *pipe, size_t len,
548                         unsigned int flags)
549 {
550         struct sock *sk = sock->sk;
551         struct tcp_splice_state tss = {
552                 .pipe = pipe,
553                 .len = len,
554                 .flags = flags,
555         };
556         long timeo;
557         ssize_t spliced;
558         int ret;
559
560         /*
561          * We can't seek on a socket input
562          */
563         if (unlikely(*ppos))
564                 return -ESPIPE;
565
566         ret = spliced = 0;
567
568         lock_sock(sk);
569
570         timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
571         while (tss.len) {
572                 ret = __tcp_splice_read(sk, &tss);
573                 if (ret < 0)
574                         break;
575                 else if (!ret) {
576                         if (spliced)
577                                 break;
578                         if (flags & SPLICE_F_NONBLOCK) {
579                                 ret = -EAGAIN;
580                                 break;
581                         }
582                         if (sock_flag(sk, SOCK_DONE))
583                                 break;
584                         if (sk->sk_err) {
585                                 ret = sock_error(sk);
586                                 break;
587                         }
588                         if (sk->sk_shutdown & RCV_SHUTDOWN)
589                                 break;
590                         if (sk->sk_state == TCP_CLOSE) {
591                                 /*
592                                  * This occurs when the user tries to read
593                                  * from a socket that was never connected.
594                                  */
595                                 if (!sock_flag(sk, SOCK_DONE))
596                                         ret = -ENOTCONN;
597                                 break;
598                         }
599                         if (!timeo) {
600                                 ret = -EAGAIN;
601                                 break;
602                         }
603                         sk_wait_data(sk, &timeo);
604                         if (signal_pending(current)) {
605                                 ret = sock_intr_errno(timeo);
606                                 break;
607                         }
608                         continue;
609                 }
610                 tss.len -= ret;
611                 spliced += ret;
612
613                 release_sock(sk);
614                 lock_sock(sk);
615
616                 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
617                     (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
618                     signal_pending(current))
619                         break;
620         }
621
622         release_sock(sk);
623
624         if (spliced)
625                 return spliced;
626
627         return ret;
628 }
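/*
 * Userspace sketch of driving tcp_splice_read() above via splice(2)
 * (illustrative only; 'tcp_fd' and 'out_fd' are placeholders):
 *
 *	int pfd[2];
 *	ssize_t n;
 *
 *	pipe(pfd);
 *	// move up to 64k from the socket into the pipe, no userspace copy
 *	n = splice(tcp_fd, NULL, pfd[1], NULL, 65536,
 *		   SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
 *	// ... then from the pipe to a file, another socket, etc.
 *	if (n > 0)
 *		splice(pfd[0], NULL, out_fd, NULL, n, SPLICE_F_MOVE);
 */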
629
630 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
631 {
632         struct sk_buff *skb;
633
634         /* The TCP header must be at least 32-bit aligned.  */
635         size = ALIGN(size, 4);
636
637         skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
638         if (skb) {
639                 if (sk_wmem_schedule(sk, skb->truesize)) {
640                         /*
641                          * Make sure that we have exactly size bytes
642                          * available to the caller, no more, no less.
643                          */
644                         skb_reserve(skb, skb_tailroom(skb) - size);
645                         return skb;
646                 }
647                 __kfree_skb(skb);
648         } else {
649                 sk->sk_prot->enter_memory_pressure();
650                 sk_stream_moderate_sndbuf(sk);
651         }
652         return NULL;
653 }
654
655 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
656                          size_t psize, int flags)
657 {
658         struct tcp_sock *tp = tcp_sk(sk);
659         int mss_now, size_goal;
660         int err;
661         ssize_t copied;
662         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
663
664         /* Wait for a connection to finish. */
665         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
666                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
667                         goto out_err;
668
669         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
670
671         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
672         size_goal = tp->xmit_size_goal;
673         copied = 0;
674
675         err = -EPIPE;
676         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
677                 goto do_error;
678
679         while (psize > 0) {
680                 struct sk_buff *skb = tcp_write_queue_tail(sk);
681                 struct page *page = pages[poffset / PAGE_SIZE];
682                 int copy, i, can_coalesce;
683                 int offset = poffset % PAGE_SIZE;
684                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
685
686                 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
687 new_segment:
688                         if (!sk_stream_memory_free(sk))
689                                 goto wait_for_sndbuf;
690
691                         skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
692                         if (!skb)
693                                 goto wait_for_memory;
694
695                         skb_entail(sk, skb);
696                         copy = size_goal;
697                 }
698
699                 if (copy > size)
700                         copy = size;
701
702                 i = skb_shinfo(skb)->nr_frags;
703                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
704                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
705                         tcp_mark_push(tp, skb);
706                         goto new_segment;
707                 }
708                 if (!sk_wmem_schedule(sk, copy))
709                         goto wait_for_memory;
710
711                 if (can_coalesce) {
712                         skb_shinfo(skb)->frags[i - 1].size += copy;
713                 } else {
714                         get_page(page);
715                         skb_fill_page_desc(skb, i, page, offset, copy);
716                 }
717
718                 skb->len += copy;
719                 skb->data_len += copy;
720                 skb->truesize += copy;
721                 sk->sk_wmem_queued += copy;
722                 sk_mem_charge(sk, copy);
723                 skb->ip_summed = CHECKSUM_PARTIAL;
724                 tp->write_seq += copy;
725                 TCP_SKB_CB(skb)->end_seq += copy;
726                 skb_shinfo(skb)->gso_segs = 0;
727
728                 if (!copied)
729                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
730
731                 copied += copy;
732                 poffset += copy;
733                 if (!(psize -= copy))
734                         goto out;
735
736                 if (skb->len < size_goal || (flags & MSG_OOB))
737                         continue;
738
739                 if (forced_push(tp)) {
740                         tcp_mark_push(tp, skb);
741                         __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
742                 } else if (skb == tcp_send_head(sk))
743                         tcp_push_one(sk, mss_now);
744                 continue;
745
746 wait_for_sndbuf:
747                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
748 wait_for_memory:
749                 if (copied)
750                         tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
751
752                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
753                         goto do_error;
754
755                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
756                 size_goal = tp->xmit_size_goal;
757         }
758
759 out:
760         if (copied)
761                 tcp_push(sk, flags, mss_now, tp->nonagle);
762         return copied;
763
764 do_error:
765         if (copied)
766                 goto out;
767 out_err:
768         return sk_stream_error(sk, flags, err);
769 }
770
771 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
772                      size_t size, int flags)
773 {
774         ssize_t res;
775         struct sock *sk = sock->sk;
776
777         if (!(sk->sk_route_caps & NETIF_F_SG) ||
778             !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
779                 return sock_no_sendpage(sock, page, offset, size, flags);
780
781         lock_sock(sk);
782         TCP_CHECK_TIMER(sk);
783         res = do_tcp_sendpages(sk, &page, offset, size, flags);
784         TCP_CHECK_TIMER(sk);
785         release_sock(sk);
786         return res;
787 }
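/*
 * One common userspace route into tcp_sendpage() above is sendfile(2)
 * with a TCP socket as the destination (illustrative sketch; 'tcp_fd',
 * 'file_fd' and 'file_size' are placeholders):
 *
 *	off_t off = 0;
 *
 *	while (off < file_size)
 *		if (sendfile(tcp_fd, file_fd, &off, file_size - off) <= 0)
 *			break;
 */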
788
789 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
790 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
791
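/*
 * Rough description of select_size() below: it chooses how much linear
 * ("head") room tcp_sendmsg() asks for when allocating a fresh skb.
 * GSO-capable routes get 0 (payload goes straight into page frags),
 * plain SG devices get the current MSS clamped to what still fits in
 * the skb head, and non-SG devices get the full MSS linearly.
 */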
792 static inline int select_size(struct sock *sk)
793 {
794         struct tcp_sock *tp = tcp_sk(sk);
795         int tmp = tp->mss_cache;
796
797         if (sk->sk_route_caps & NETIF_F_SG) {
798                 if (sk_can_gso(sk))
799                         tmp = 0;
800                 else {
801                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
802
803                         if (tmp >= pgbreak &&
804                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
805                                 tmp = pgbreak;
806                 }
807         }
808
809         return tmp;
810 }
811
812 int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
813                 size_t size)
814 {
815         struct sock *sk = sock->sk;
816         struct iovec *iov;
817         struct tcp_sock *tp = tcp_sk(sk);
818         struct sk_buff *skb;
819         int iovlen, flags;
820         int mss_now, size_goal;
821         int err, copied;
822         long timeo;
823
824         lock_sock(sk);
825         TCP_CHECK_TIMER(sk);
826
827         flags = msg->msg_flags;
828         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
829
830         /* Wait for a connection to finish. */
831         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
832                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
833                         goto out_err;
834
835         /* This should be in poll */
836         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
837
838         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
839         size_goal = tp->xmit_size_goal;
840
841         /* Ok commence sending. */
842         iovlen = msg->msg_iovlen;
843         iov = msg->msg_iov;
844         copied = 0;
845
846         err = -EPIPE;
847         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
848                 goto do_error;
849
850         while (--iovlen >= 0) {
851                 int seglen = iov->iov_len;
852                 unsigned char __user *from = iov->iov_base;
853
854                 iov++;
855
856                 while (seglen > 0) {
857                         int copy;
858
859                         skb = tcp_write_queue_tail(sk);
860
861                         if (!tcp_send_head(sk) ||
862                             (copy = size_goal - skb->len) <= 0) {
863
864 new_segment:
865                                 /* Allocate new segment. If the interface is SG,
866                                  * allocate an skb that fits into a single page.
867                                  */
868                                 if (!sk_stream_memory_free(sk))
869                                         goto wait_for_sndbuf;
870
871                                 skb = sk_stream_alloc_skb(sk, select_size(sk),
872                                                 sk->sk_allocation);
873                                 if (!skb)
874                                         goto wait_for_memory;
875
876                                 /*
877                                  * Check whether we can use HW checksum.
878                                  */
879                                 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
880                                         skb->ip_summed = CHECKSUM_PARTIAL;
881
882                                 skb_entail(sk, skb);
883                                 copy = size_goal;
884                         }
885
886                         /* Try to append data to the end of skb. */
887                         if (copy > seglen)
888                                 copy = seglen;
889
890                         /* Where to copy to? */
891                         if (skb_tailroom(skb) > 0) {
892                                 /* We have some space in skb head. Superb! */
893                                 if (copy > skb_tailroom(skb))
894                                         copy = skb_tailroom(skb);
895                                 if ((err = skb_add_data(skb, from, copy)) != 0)
896                                         goto do_fault;
897                         } else {
898                                 int merge = 0;
899                                 int i = skb_shinfo(skb)->nr_frags;
900                                 struct page *page = TCP_PAGE(sk);
901                                 int off = TCP_OFF(sk);
902
903                                 if (skb_can_coalesce(skb, i, page, off) &&
904                                     off != PAGE_SIZE) {
905                                         /* We can extend the last page
906                                          * fragment. */
907                                         merge = 1;
908                                 } else if (i == MAX_SKB_FRAGS ||
909                                            (!i &&
910                                            !(sk->sk_route_caps & NETIF_F_SG))) {
911                                         /* Need to add new fragment and cannot
912                                          * do this because interface is non-SG,
913                                          * or because all the page slots are
914                                          * busy. */
915                                         tcp_mark_push(tp, skb);
916                                         goto new_segment;
917                                 } else if (page) {
918                                         if (off == PAGE_SIZE) {
919                                                 put_page(page);
920                                                 TCP_PAGE(sk) = page = NULL;
921                                                 off = 0;
922                                         }
923                                 } else
924                                         off = 0;
925
926                                 if (copy > PAGE_SIZE - off)
927                                         copy = PAGE_SIZE - off;
928
929                                 if (!sk_wmem_schedule(sk, copy))
930                                         goto wait_for_memory;
931
932                                 if (!page) {
933                                         /* Allocate new cache page. */
934                                         if (!(page = sk_stream_alloc_page(sk)))
935                                                 goto wait_for_memory;
936                                 }
937
938                                 /* Time to copy data. We are close to
939                                  * the end! */
940                                 err = skb_copy_to_page(sk, from, skb, page,
941                                                        off, copy);
942                                 if (err) {
943                                         /* If this page was new, give it to the
944                                          * socket so it does not get leaked.
945                                          */
946                                         if (!TCP_PAGE(sk)) {
947                                                 TCP_PAGE(sk) = page;
948                                                 TCP_OFF(sk) = 0;
949                                         }
950                                         goto do_error;
951                                 }
952
953                                 /* Update the skb. */
954                                 if (merge) {
955                                         skb_shinfo(skb)->frags[i - 1].size +=
956                                                                         copy;
957                                 } else {
958                                         skb_fill_page_desc(skb, i, page, off, copy);
959                                         if (TCP_PAGE(sk)) {
960                                                 get_page(page);
961                                         } else if (off + copy < PAGE_SIZE) {
962                                                 get_page(page);
963                                                 TCP_PAGE(sk) = page;
964                                         }
965                                 }
966
967                                 TCP_OFF(sk) = off + copy;
968                         }
969
970                         if (!copied)
971                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
972
973                         tp->write_seq += copy;
974                         TCP_SKB_CB(skb)->end_seq += copy;
975                         skb_shinfo(skb)->gso_segs = 0;
976
977                         from += copy;
978                         copied += copy;
979                         if ((seglen -= copy) == 0 && iovlen == 0)
980                                 goto out;
981
982                         if (skb->len < size_goal || (flags & MSG_OOB))
983                                 continue;
984
985                         if (forced_push(tp)) {
986                                 tcp_mark_push(tp, skb);
987                                 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
988                         } else if (skb == tcp_send_head(sk))
989                                 tcp_push_one(sk, mss_now);
990                         continue;
991
992 wait_for_sndbuf:
993                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
994 wait_for_memory:
995                         if (copied)
996                                 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
997
998                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
999                                 goto do_error;
1000
1001                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1002                         size_goal = tp->xmit_size_goal;
1003                 }
1004         }
1005
1006 out:
1007         if (copied)
1008                 tcp_push(sk, flags, mss_now, tp->nonagle);
1009         TCP_CHECK_TIMER(sk);
1010         release_sock(sk);
1011         return copied;
1012
1013 do_fault:
1014         if (!skb->len) {
1015                 tcp_unlink_write_queue(skb, sk);
1016                 /* It is the one place in all of TCP, except connection
1017                  * reset, where we can be unlinking the send_head.
1018                  */
1019                 tcp_check_send_head(sk, skb);
1020                 sk_wmem_free_skb(sk, skb);
1021         }
1022
1023 do_error:
1024         if (copied)
1025                 goto out;
1026 out_err:
1027         err = sk_stream_error(sk, flags, err);
1028         TCP_CHECK_TIMER(sk);
1029         release_sock(sk);
1030         return err;
1031 }
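/*
 * Userspace-side sketch of MSG_MORE, which tcp_sendmsg() above maps onto
 * TCP_NAGLE_CORK so a trailing partial segment is held back until the
 * sender is done (illustrative; 'tcp_fd', 'header' and 'body' are
 * placeholders):
 *
 *	send(tcp_fd, header, header_len, MSG_MORE);  // may be held back
 *	send(tcp_fd, body, body_len, 0);             // pushes everything out
 */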
1032
1033 /*
1034  *      Handle reading urgent data. BSD has very simple semantics for
1035  *      this, no blocking and very strange errors 8)
1036  */
1037
1038 static int tcp_recv_urg(struct sock *sk, long timeo,
1039                         struct msghdr *msg, int len, int flags,
1040                         int *addr_len)
1041 {
1042         struct tcp_sock *tp = tcp_sk(sk);
1043
1044         /* No URG data to read. */
1045         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1046             tp->urg_data == TCP_URG_READ)
1047                 return -EINVAL; /* Yes this is right ! */
1048
1049         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1050                 return -ENOTCONN;
1051
1052         if (tp->urg_data & TCP_URG_VALID) {
1053                 int err = 0;
1054                 char c = tp->urg_data;
1055
1056                 if (!(flags & MSG_PEEK))
1057                         tp->urg_data = TCP_URG_READ;
1058
1059                 /* Read urgent data. */
1060                 msg->msg_flags |= MSG_OOB;
1061
1062                 if (len > 0) {
1063                         if (!(flags & MSG_TRUNC))
1064                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1065                         len = 1;
1066                 } else
1067                         msg->msg_flags |= MSG_TRUNC;
1068
1069                 return err ? -EFAULT : len;
1070         }
1071
1072         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1073                 return 0;
1074
1075         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1076          * the available implementations agree in this case:
1077          * this call should never block, independent of the
1078          * blocking state of the socket.
1079          * Mike <pall@rz.uni-karlsruhe.de>
1080          */
1081         return -EAGAIN;
1082 }
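/*
 * Userspace sketch of the MSG_OOB semantics implemented above
 * (illustrative; 'tcp_fd' is a placeholder).  Unless SO_OOBINLINE is set,
 * the single byte of urgent data is fetched out of band and the call
 * never blocks:
 *
 *	char oob;
 *
 *	if (recv(tcp_fd, &oob, 1, MSG_OOB) == 1)
 *		;	// got the urgent byte
 *	else
 *		;	// -EINVAL/-EAGAIN etc., see tcp_recv_urg() above
 */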
1083
1084 /* Clean up the receive buffer for full frames taken by the user,
1085  * then send an ACK if necessary.  COPIED is the number of bytes
1086  * tcp_recvmsg has given to the user so far, it speeds up the
1087  * calculation of whether or not we must ACK for the sake of
1088  * a window update.
1089  */
1090 void tcp_cleanup_rbuf(struct sock *sk, int copied)
1091 {
1092         struct tcp_sock *tp = tcp_sk(sk);
1093         int time_to_ack = 0;
1094
1095 #if TCP_DEBUG
1096         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1097
1098         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1099 #endif
1100
1101         if (inet_csk_ack_scheduled(sk)) {
1102                 const struct inet_connection_sock *icsk = inet_csk(sk);
1103                    /* Delayed ACKs frequently hit locked sockets during bulk
1104                     * receive. */
1105                 if (icsk->icsk_ack.blocked ||
1106                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1107                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1108                     /*
1109                      * If this read emptied the read buffer, we send an ACK
1110                      * if the connection is not bidirectional, the user drained
1111                      * the receive buffer, and there was a small segment
1112                      * in the queue.
1113                      */
1114                     (copied > 0 &&
1115                      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1116                       ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1117                        !icsk->icsk_ack.pingpong)) &&
1118                       !atomic_read(&sk->sk_rmem_alloc)))
1119                         time_to_ack = 1;
1120         }
1121
1122         /* We send an ACK if we can now advertise a non-zero window
1123          * which has been raised "significantly".
1124          *
1125          * Even if the window is raised to infinity, do not send a window-open
1126          * ACK in states where we will not receive any more data. It is useless.
1127          */
1128         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1129                 __u32 rcv_window_now = tcp_receive_window(tp);
1130
1131                 /* Optimize, __tcp_select_window() is not cheap. */
1132                 if (2*rcv_window_now <= tp->window_clamp) {
1133                         __u32 new_window = __tcp_select_window(sk);
1134
1135                         /* Send an ACK now if this read freed lots of space
1136                          * in our buffer. We can advertise the new window now
1137                          * if it is not less than the current one.
1138                          * "Lots" means "at least twice" here.
1139                          */
1140                         if (new_window && new_window >= 2 * rcv_window_now)
1141                                 time_to_ack = 1;
1142                 }
1143         }
1144         if (time_to_ack)
1145                 tcp_send_ack(sk);
1146 }
1147
1148 static void tcp_prequeue_process(struct sock *sk)
1149 {
1150         struct sk_buff *skb;
1151         struct tcp_sock *tp = tcp_sk(sk);
1152
1153         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1154
1155         /* RX process wants to run with disabled BHs, though it is not
1156          * necessary */
1157         local_bh_disable();
1158         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1159                 sk->sk_backlog_rcv(sk, skb);
1160         local_bh_enable();
1161
1162         /* Clear memory counter. */
1163         tp->ucopy.memory = 0;
1164 }
1165
1166 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1167 {
1168         struct sk_buff *skb;
1169         u32 offset;
1170
1171         skb_queue_walk(&sk->sk_receive_queue, skb) {
1172                 offset = seq - TCP_SKB_CB(skb)->seq;
1173                 if (tcp_hdr(skb)->syn)
1174                         offset--;
1175                 if (offset < skb->len || tcp_hdr(skb)->fin) {
1176                         *off = offset;
1177                         return skb;
1178                 }
1179         }
1180         return NULL;
1181 }
1182
1183 /*
1184  * This routine provides an alternative to tcp_recvmsg() for routines
1185  * that would like to handle copying from skbuffs directly in 'sendfile'
1186  * fashion.
1187  * Note:
1188  *      - It is assumed that the socket was locked by the caller.
1189  *      - The routine does not block.
1190  *      - At present, there is no support for reading OOB data
1191  *        or for 'peeking' the socket using this routine
1192  *        (although both would be easy to implement).
1193  */
1194 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1195                   sk_read_actor_t recv_actor)
1196 {
1197         struct sk_buff *skb;
1198         struct tcp_sock *tp = tcp_sk(sk);
1199         u32 seq = tp->copied_seq;
1200         u32 offset;
1201         int copied = 0;
1202
1203         if (sk->sk_state == TCP_LISTEN)
1204                 return -ENOTCONN;
1205         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1206                 if (offset < skb->len) {
1207                         size_t used, len;
1208
1209                         len = skb->len - offset;
1210                         /* Stop reading if we hit a patch of urgent data */
1211                         if (tp->urg_data) {
1212                                 u32 urg_offset = tp->urg_seq - seq;
1213                                 if (urg_offset < len)
1214                                         len = urg_offset;
1215                                 if (!len)
1216                                         break;
1217                         }
1218                         used = recv_actor(desc, skb, offset, len);
1219                         if (used < 0) {
1220                                 if (!copied)
1221                                         copied = used;
1222                                 break;
1223                         } else if (used <= len) {
1224                                 seq += used;
1225                                 copied += used;
1226                                 offset += used;
1227                         }
1228                         /*
1229                          * If recv_actor drops the lock (e.g. TCP splice
1230                          * receive) the skb pointer might be invalid when
1231                          * getting here: tcp_collapse might have deleted it
1232                          * while aggregating skbs from the socket queue.
1233                          */
1234                         skb = tcp_recv_skb(sk, seq-1, &offset);
1235                         if (!skb || (offset+1 != skb->len))
1236                                 break;
1237                 }
1238                 if (tcp_hdr(skb)->fin) {
1239                         sk_eat_skb(sk, skb, 0);
1240                         ++seq;
1241                         break;
1242                 }
1243                 sk_eat_skb(sk, skb, 0);
1244                 if (!desc->count)
1245                         break;
1246         }
1247         tp->copied_seq = seq;
1248
1249         tcp_rcv_space_adjust(sk);
1250
1251         /* Clean up data we have read: This will do ACK frames. */
1252         if (copied > 0)
1253                 tcp_cleanup_rbuf(sk, copied);
1254         return copied;
1255 }
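/* Illustrative sketch of a minimal recv_actor for tcp_read_sock(); the name
 * "budget_actor" is hypothetical.  The actor is handed each skb in turn and
 * returns how many bytes it consumed (or a negative error); tcp_read_sock()
 * stops once desc->count reaches zero.
 *
 *	static int budget_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *				unsigned int offset, size_t len)
 *	{
 *		size_t want = min_t(size_t, len, desc->count);
 *
 *		desc->count -= want;
 *		return want;
 *	}
 *
 * With the socket already locked, a caller could then do:
 *
 *	read_descriptor_t desc = { .count = budget };
 *	int copied = tcp_read_sock(sk, &desc, budget_actor);
 */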
1256
1257 /*
1258  *      This routine copies from a sock struct into the user buffer.
1259  *
1260  *      Technical note: in 2.3 we work on _locked_ socket, so that
1261  *      tricks with *seq access order and skb->users are not required.
1262  *      Probably, code can be easily improved even more.
1263  */
1264
1265 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1266                 size_t len, int nonblock, int flags, int *addr_len)
1267 {
1268         struct tcp_sock *tp = tcp_sk(sk);
1269         int copied = 0;
1270         u32 peek_seq;
1271         u32 *seq;
1272         unsigned long used;
1273         int err;
1274         int target;             /* Read at least this many bytes */
1275         long timeo;
1276         struct task_struct *user_recv = NULL;
1277         int copied_early = 0;
1278         struct sk_buff *skb;
1279
1280         lock_sock(sk);
1281
1282         TCP_CHECK_TIMER(sk);
1283
1284         err = -ENOTCONN;
1285         if (sk->sk_state == TCP_LISTEN)
1286                 goto out;
1287
1288         timeo = sock_rcvtimeo(sk, nonblock);
1289
1290         /* Urgent data needs to be handled specially. */
1291         if (flags & MSG_OOB)
1292                 goto recv_urg;
1293
1294         seq = &tp->copied_seq;
1295         if (flags & MSG_PEEK) {
1296                 peek_seq = tp->copied_seq;
1297                 seq = &peek_seq;
1298         }
1299
1300         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1301
1302 #ifdef CONFIG_NET_DMA
1303         tp->ucopy.dma_chan = NULL;
1304         preempt_disable();
1305         skb = skb_peek_tail(&sk->sk_receive_queue);
1306         {
1307                 int available = 0;
1308
1309                 if (skb)
1310                         available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1311                 if ((available < target) &&
1312                     (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1313                     !sysctl_tcp_low_latency &&
1314                     __get_cpu_var(softnet_data).net_dma) {
1315                         preempt_enable_no_resched();
1316                         tp->ucopy.pinned_list =
1317                                         dma_pin_iovec_pages(msg->msg_iov, len);
1318                 } else {
1319                         preempt_enable_no_resched();
1320                 }
1321         }
1322 #endif
1323
1324         do {
1325                 u32 offset;
1326
1327                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1328                 if (tp->urg_data && tp->urg_seq == *seq) {
1329                         if (copied)
1330                                 break;
1331                         if (signal_pending(current)) {
1332                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1333                                 break;
1334                         }
1335                 }
1336
1337                 /* Next get a buffer. */
1338
1339                 skb = skb_peek(&sk->sk_receive_queue);
1340                 do {
1341                         if (!skb)
1342                                 break;
1343
1344                         /* Now that we have two receive queues this
1345                          * shouldn't happen.
1346                          */
1347                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1348                                 printk(KERN_INFO "recvmsg bug: copied %X "
1349                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1350                                 break;
1351                         }
1352                         offset = *seq - TCP_SKB_CB(skb)->seq;
1353                         if (tcp_hdr(skb)->syn)
1354                                 offset--;
1355                         if (offset < skb->len)
1356                                 goto found_ok_skb;
1357                         if (tcp_hdr(skb)->fin)
1358                                 goto found_fin_ok;
1359                         BUG_TRAP(flags & MSG_PEEK);
1360                         skb = skb->next;
1361                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1362
1363                 /* Well, if we have backlog, try to process it now. */
1364
1365                 if (copied >= target && !sk->sk_backlog.tail)
1366                         break;
1367
1368                 if (copied) {
1369                         if (sk->sk_err ||
1370                             sk->sk_state == TCP_CLOSE ||
1371                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1372                             !timeo ||
1373                             signal_pending(current) ||
1374                             (flags & MSG_PEEK))
1375                                 break;
1376                 } else {
1377                         if (sock_flag(sk, SOCK_DONE))
1378                                 break;
1379
1380                         if (sk->sk_err) {
1381                                 copied = sock_error(sk);
1382                                 break;
1383                         }
1384
1385                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1386                                 break;
1387
1388                         if (sk->sk_state == TCP_CLOSE) {
1389                                 if (!sock_flag(sk, SOCK_DONE)) {
1390                                         /* This occurs when the user tries to
1391                                          * read from a never-connected socket.
1392                                          */
1393                                         copied = -ENOTCONN;
1394                                         break;
1395                                 }
1396                                 break;
1397                         }
1398
1399                         if (!timeo) {
1400                                 copied = -EAGAIN;
1401                                 break;
1402                         }
1403
1404                         if (signal_pending(current)) {
1405                                 copied = sock_intr_errno(timeo);
1406                                 break;
1407                         }
1408                 }
1409
1410                 tcp_cleanup_rbuf(sk, copied);
1411
1412                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1413                         /* Install new reader */
1414                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1415                                 user_recv = current;
1416                                 tp->ucopy.task = user_recv;
1417                                 tp->ucopy.iov = msg->msg_iov;
1418                         }
1419
1420                         tp->ucopy.len = len;
1421
1422                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1423                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1424
1425                         /* Ugly... If the prequeue is not empty, we have to
1426                          * process it before releasing the socket, otherwise
1427                          * the order will be broken on the second iteration.
1428                          * A more elegant solution is required!!!
1429                          *
1430                          * Look: we have the following (pseudo)queues:
1431                          *
1432                          * 1. packets in flight
1433                          * 2. backlog
1434                          * 3. prequeue
1435                          * 4. receive_queue
1436                          *
1437                          * Each queue can be processed only if the next ones
1438                          * are empty. At this point the receive_queue is empty,
1439                          * but the prequeue _can_ be non-empty after the 2nd
1440                          * iteration, when we jumped to the start of the loop
1441                          * because backlog processing added something to the
1442                          * receive_queue. We cannot release_sock(), because
1443                          * the backlog contains packets that arrived _after_
1444                          * the prequeued ones.
1445                          *
1446                          * In short, the algorithm is clear: process all the
1447                          * queues in order. Requeueing packets from the backlog
1448                          * to the prequeue when it is not empty would be more
1449                          * elegant, but unfortunately it eats cycles.
1450                          */
1451                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1452                                 goto do_prequeue;
1453
1454                         /* __ Set realtime policy in scheduler __ */
1455                 }
1456
1457                 if (copied >= target) {
1458                         /* Do not sleep, just process backlog. */
1459                         release_sock(sk);
1460                         lock_sock(sk);
1461                 } else
1462                         sk_wait_data(sk, &timeo);
1463
1464 #ifdef CONFIG_NET_DMA
1465                 tp->ucopy.wakeup = 0;
1466 #endif
1467
1468                 if (user_recv) {
1469                         int chunk;
1470
1471                         /* __ Restore normal policy in scheduler __ */
1472
1473                         if ((chunk = len - tp->ucopy.len) != 0) {
1474                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1475                                 len -= chunk;
1476                                 copied += chunk;
1477                         }
1478
1479                         if (tp->rcv_nxt == tp->copied_seq &&
1480                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1481 do_prequeue:
1482                                 tcp_prequeue_process(sk);
1483
1484                                 if ((chunk = len - tp->ucopy.len) != 0) {
1485                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1486                                         len -= chunk;
1487                                         copied += chunk;
1488                                 }
1489                         }
1490                 }
1491                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1492                         if (net_ratelimit())
1493                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1494                                        current->comm, task_pid_nr(current));
1495                         peek_seq = tp->copied_seq;
1496                 }
1497                 continue;
1498
1499         found_ok_skb:
1500                 /* Ok so how much can we use? */
1501                 used = skb->len - offset;
1502                 if (len < used)
1503                         used = len;
1504
1505                 /* Do we have urgent data here? */
1506                 if (tp->urg_data) {
1507                         u32 urg_offset = tp->urg_seq - *seq;
1508                         if (urg_offset < used) {
1509                                 if (!urg_offset) {
1510                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1511                                                 ++*seq;
1512                                                 offset++;
1513                                                 used--;
1514                                                 if (!used)
1515                                                         goto skip_copy;
1516                                         }
1517                                 } else
1518                                         used = urg_offset;
1519                         }
1520                 }
1521
1522                 if (!(flags & MSG_TRUNC)) {
1523 #ifdef CONFIG_NET_DMA
1524                         if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1525                                 tp->ucopy.dma_chan = get_softnet_dma();
1526
1527                         if (tp->ucopy.dma_chan) {
1528                                 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1529                                         tp->ucopy.dma_chan, skb, offset,
1530                                         msg->msg_iov, used,
1531                                         tp->ucopy.pinned_list);
1532
1533                                 if (tp->ucopy.dma_cookie < 0) {
1534
1535                                         printk(KERN_ALERT "dma_cookie < 0\n");
1536
1537                                         /* Exception. Bailout! */
1538                                         if (!copied)
1539                                                 copied = -EFAULT;
1540                                         break;
1541                                 }
1542                                 if ((offset + used) == skb->len)
1543                                         copied_early = 1;
1544
1545                         } else
1546 #endif
1547                         {
1548                                 err = skb_copy_datagram_iovec(skb, offset,
1549                                                 msg->msg_iov, used);
1550                                 if (err) {
1551                                         /* Exception. Bailout! */
1552                                         if (!copied)
1553                                                 copied = -EFAULT;
1554                                         break;
1555                                 }
1556                         }
1557                 }
1558
1559                 *seq += used;
1560                 copied += used;
1561                 len -= used;
1562
1563                 tcp_rcv_space_adjust(sk);
1564
1565 skip_copy:
1566                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1567                         tp->urg_data = 0;
1568                         tcp_fast_path_check(sk);
1569                 }
1570                 if (used + offset < skb->len)
1571                         continue;
1572
1573                 if (tcp_hdr(skb)->fin)
1574                         goto found_fin_ok;
1575                 if (!(flags & MSG_PEEK)) {
1576                         sk_eat_skb(sk, skb, copied_early);
1577                         copied_early = 0;
1578                 }
1579                 continue;
1580
1581         found_fin_ok:
1582                 /* Process the FIN. */
1583                 ++*seq;
1584                 if (!(flags & MSG_PEEK)) {
1585                         sk_eat_skb(sk, skb, copied_early);
1586                         copied_early = 0;
1587                 }
1588                 break;
1589         } while (len > 0);
1590
1591         if (user_recv) {
1592                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1593                         int chunk;
1594
1595                         tp->ucopy.len = copied > 0 ? len : 0;
1596
1597                         tcp_prequeue_process(sk);
1598
1599                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1600                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1601                                 len -= chunk;
1602                                 copied += chunk;
1603                         }
1604                 }
1605
1606                 tp->ucopy.task = NULL;
1607                 tp->ucopy.len = 0;
1608         }
1609
1610 #ifdef CONFIG_NET_DMA
1611         if (tp->ucopy.dma_chan) {
1612                 dma_cookie_t done, used;
1613
1614                 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1615
1616                 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1617                                                  tp->ucopy.dma_cookie, &done,
1618                                                  &used) == DMA_IN_PROGRESS) {
1619                         /* do partial cleanup of sk_async_wait_queue */
1620                         while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1621                                (dma_async_is_complete(skb->dma_cookie, done,
1622                                                       used) == DMA_SUCCESS)) {
1623                                 __skb_dequeue(&sk->sk_async_wait_queue);
1624                                 kfree_skb(skb);
1625                         }
1626                 }
1627
1628                 /* Safe to free early-copied skbs now */
1629                 __skb_queue_purge(&sk->sk_async_wait_queue);
1630                 dma_chan_put(tp->ucopy.dma_chan);
1631                 tp->ucopy.dma_chan = NULL;
1632         }
1633         if (tp->ucopy.pinned_list) {
1634                 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1635                 tp->ucopy.pinned_list = NULL;
1636         }
1637 #endif
1638
1639         /* According to UNIX98, msg_name/msg_namelen are ignored
1640          * on a connected socket. I was just happy when I found this 8) --ANK
1641          */
1642
1643         /* Clean up data we have read: This will do ACK frames. */
1644         tcp_cleanup_rbuf(sk, copied);
1645
1646         TCP_CHECK_TIMER(sk);
1647         release_sock(sk);
1648         return copied;
1649
1650 out:
1651         TCP_CHECK_TIMER(sk);
1652         release_sock(sk);
1653         return err;
1654
1655 recv_urg:
1656         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1657         goto out;
1658 }
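/* Illustrative userspace sketch of two of the flags handled above: MSG_PEEK
 * copies data without advancing copied_seq, and MSG_WAITALL raises "target"
 * so the call keeps blocking until the whole buffer is filled (or an error or
 * EOF cuts it short).  The 4-byte header is a hypothetical example.
 *
 *	char hdr[4];
 *	ssize_t n;
 *
 *	n = recv(fd, hdr, sizeof(hdr), MSG_PEEK);
 *	n = recv(fd, hdr, sizeof(hdr), MSG_WAITALL);
 */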
1659
1660 void tcp_set_state(struct sock *sk, int state)
1661 {
1662         int oldstate = sk->sk_state;
1663
1664         switch (state) {
1665         case TCP_ESTABLISHED:
1666                 if (oldstate != TCP_ESTABLISHED)
1667                         TCP_INC_STATS(TCP_MIB_CURRESTAB);
1668                 break;
1669
1670         case TCP_CLOSE:
1671                 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1672                         TCP_INC_STATS(TCP_MIB_ESTABRESETS);
1673
1674                 sk->sk_prot->unhash(sk);
1675                 if (inet_csk(sk)->icsk_bind_hash &&
1676                     !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1677                         inet_put_port(sk);
1678                 /* fall through */
1679         default:
1680                 if (oldstate == TCP_ESTABLISHED)
1681                         TCP_DEC_STATS(TCP_MIB_CURRESTAB);
1682         }
1683
1684         /* Change state AFTER socket is unhashed to avoid closed
1685          * socket sitting in hash tables.
1686          */
1687         sk->sk_state = state;
1688
1689 #ifdef STATE_TRACE
1690         SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1691 #endif
1692 }
1693 EXPORT_SYMBOL_GPL(tcp_set_state);
1694
1695 /*
1696  *      State processing on a close. This implements the state shift for
1697  *      sending our FIN frame. Note that we only send a FIN for some
1698  *      states. A shutdown() may have already sent the FIN, or we may be
1699  *      closed.
1700  */
1701
1702 static const unsigned char new_state[16] = {
1703   /* current state:        new state:      action:      */
1704   /* (Invalid)          */ TCP_CLOSE,
1705   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1706   /* TCP_SYN_SENT       */ TCP_CLOSE,
1707   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1708   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1709   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1710   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1711   /* TCP_CLOSE          */ TCP_CLOSE,
1712   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1713   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1714   /* TCP_LISTEN         */ TCP_CLOSE,
1715   /* TCP_CLOSING        */ TCP_CLOSING,
1716 };
1717
1718 static int tcp_close_state(struct sock *sk)
1719 {
1720         int next = (int)new_state[sk->sk_state];
1721         int ns = next & TCP_STATE_MASK;
1722
1723         tcp_set_state(sk, ns);
1724
1725         return next & TCP_ACTION_FIN;
1726 }
1727
1728 /*
1729  *      Shut down the sending side of a connection. Much like close except
1730  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1731  */
1732
1733 void tcp_shutdown(struct sock *sk, int how)
1734 {
1735         /*      We need to grab some memory, and put together a FIN,
1736          *      and then put it into the queue to be sent.
1737          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1738          */
1739         if (!(how & SEND_SHUTDOWN))
1740                 return;
1741
1742         /* If we've already sent a FIN, or it's a closed state, skip this. */
1743         if ((1 << sk->sk_state) &
1744             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1745              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1746                 /* Clear out any half completed packets.  FIN if needed. */
1747                 if (tcp_close_state(sk))
1748                         tcp_send_fin(sk);
1749         }
1750 }
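/* Illustrative userspace sketch: SEND_SHUTDOWN above corresponds to
 * shutdown(fd, SHUT_WR).  It queues our FIN but leaves the receive side open,
 * so the peer's remaining data (and its FIN) can still be read.  "consume" is
 * a hypothetical placeholder for the application's own handling.
 *
 *	shutdown(fd, SHUT_WR);
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		consume(buf, n);
 *	close(fd);
 */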
1751
1752 void tcp_close(struct sock *sk, long timeout)
1753 {
1754         struct sk_buff *skb;
1755         int data_was_unread = 0;
1756         int state;
1757
1758         lock_sock(sk);
1759         sk->sk_shutdown = SHUTDOWN_MASK;
1760
1761         if (sk->sk_state == TCP_LISTEN) {
1762                 tcp_set_state(sk, TCP_CLOSE);
1763
1764                 /* Special case. */
1765                 inet_csk_listen_stop(sk);
1766
1767                 goto adjudge_to_death;
1768         }
1769
1770         /*  We need to flush the receive buffers.  We do this only on the
1771          *  descriptor close, not protocol-sourced closes, because the
1772          *  reader process may not have drained the data yet!
1773          */
1774         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1775                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1776                           tcp_hdr(skb)->fin;
1777                 data_was_unread += len;
1778                 __kfree_skb(skb);
1779         }
1780
1781         sk_mem_reclaim(sk);
1782
1783         /* As outlined in RFC 2525, section 2.17, we send a RST here because
1784          * data was lost. To witness the awful effects of the old behavior of
1785          * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
1786          * GET in an FTP client, suspend the process, wait for the client to
1787          * advertise a zero window, then kill -9 the FTP client, wheee...
1788          * Note: timeout is always zero in such a case.
1789          */
1790         if (data_was_unread) {
1791                 /* Unread data was tossed, zap the connection. */
1792                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1793                 tcp_set_state(sk, TCP_CLOSE);
1794                 tcp_send_active_reset(sk, GFP_KERNEL);
1795         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1796                 /* Check zero linger _after_ checking for unread data. */
1797                 sk->sk_prot->disconnect(sk, 0);
1798                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1799         } else if (tcp_close_state(sk)) {
1800                 /* We FIN if the application ate all the data before
1801                  * zapping the connection.
1802                  */
1803
1804                 /* RED-PEN. Formally speaking, we have broken TCP state
1805                  * machine. State transitions:
1806                  *
1807                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1808                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1809                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1810                  *
1811                  * are legal only when FIN has been sent (i.e. in window),
1812                  * rather than queued out of window. Purists may complain.
1813                  *
1814                  * F.e. the "RFC state" is ESTABLISHED
1815                  * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
1816                  *
1817                  * The visible deviations are that we sometimes enter the
1818                  * time-wait state when it is not really required (harmless),
1819                  * and do not send active resets when they are required by the
1820                  * specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look
1821                  * like CLOSING or LAST_ACK to Linux).
1822                  * Probably I have missed some more small holes.
1823                  *                                              --ANK
1824                  */
1825                 tcp_send_fin(sk);
1826         }
1827
1828         sk_stream_wait_close(sk, timeout);
1829
1830 adjudge_to_death:
1831         state = sk->sk_state;
1832         sock_hold(sk);
1833         sock_orphan(sk);
1834         atomic_inc(sk->sk_prot->orphan_count);
1835
1836         /* It is the last release_sock in its life. It will remove backlog. */
1837         release_sock(sk);
1838
1839
1840         /* Now the socket is owned by the kernel and we acquire the BH lock
1841          * to finish the close. No need to check for user refs.
1842          */
1843         local_bh_disable();
1844         bh_lock_sock(sk);
1845         BUG_TRAP(!sock_owned_by_user(sk));
1846
1847         /* Have we already been destroyed by a softirq or backlog? */
1848         if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1849                 goto out;
1850
1851         /*      This is a (useful) BSD-violating departure from the RFC. There
1852          *      is a problem with TCP as specified: the other end could keep a
1853          *      socket open forever with no application left on this end.
1854          *      We use a 3 minute timeout (about the same as BSD) and then kill
1855          *      our end. If they send after that then tough - BUT: it is long
1856          *      enough that we won't repeat the old "4*rto = almost no time -
1857          *      whoops" reset mistake.
1858          *
1859          *      Nope, it was not a mistake. It is really the desired behaviour,
1860          *      e.g. on HTTP servers, where such sockets are useless but consume
1861          *      significant resources. Let's do it with the special
1862          *      linger2 option.                                 --ANK
1863          */
1864
1865         if (sk->sk_state == TCP_FIN_WAIT2) {
1866                 struct tcp_sock *tp = tcp_sk(sk);
1867                 if (tp->linger2 < 0) {
1868                         tcp_set_state(sk, TCP_CLOSE);
1869                         tcp_send_active_reset(sk, GFP_ATOMIC);
1870                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1871                 } else {
1872                         const int tmo = tcp_fin_time(sk);
1873
1874                         if (tmo > TCP_TIMEWAIT_LEN) {
1875                                 inet_csk_reset_keepalive_timer(sk,
1876                                                 tmo - TCP_TIMEWAIT_LEN);
1877                         } else {
1878                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1879                                 goto out;
1880                         }
1881                 }
1882         }
1883         if (sk->sk_state != TCP_CLOSE) {
1884                 sk_mem_reclaim(sk);
1885                 if (tcp_too_many_orphans(sk,
1886                                 atomic_read(sk->sk_prot->orphan_count))) {
1887                         if (net_ratelimit())
1888                                 printk(KERN_INFO "TCP: too many orphaned "
1889                                        "sockets\n");
1890                         tcp_set_state(sk, TCP_CLOSE);
1891                         tcp_send_active_reset(sk, GFP_ATOMIC);
1892                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1893                 }
1894         }
1895
1896         if (sk->sk_state == TCP_CLOSE)
1897                 inet_csk_destroy_sock(sk);
1898         /* Otherwise, socket is reprieved until protocol close. */
1899
1900 out:
1901         bh_unlock_sock(sk);
1902         local_bh_enable();
1903         sock_put(sk);
1904 }
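/* Illustrative userspace sketch: besides the unread-data case above, an
 * application can request the abortive close path explicitly by enabling a
 * zero-second linger before close():
 *
 *	struct linger lin = { .l_onoff = 1, .l_linger = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
 *	close(fd);
 *
 * With that option set, close() resets the connection instead of performing
 * the normal FIN handshake.
 */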
1905
1906 /* These states need RST on ABORT according to RFC793 */
1907
1908 static inline int tcp_need_reset(int state)
1909 {
1910         return (1 << state) &
1911                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1912                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1913 }
1914
1915 int tcp_disconnect(struct sock *sk, int flags)
1916 {
1917         struct inet_sock *inet = inet_sk(sk);
1918         struct inet_connection_sock *icsk = inet_csk(sk);
1919         struct tcp_sock *tp = tcp_sk(sk);
1920         int err = 0;
1921         int old_state = sk->sk_state;
1922
1923         if (old_state != TCP_CLOSE)
1924                 tcp_set_state(sk, TCP_CLOSE);
1925
1926         /* ABORT function of RFC793 */
1927         if (old_state == TCP_LISTEN) {
1928                 inet_csk_listen_stop(sk);
1929         } else if (tcp_need_reset(old_state) ||
1930                    (tp->snd_nxt != tp->write_seq &&
1931                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1932                 /* The last check adjusts for the discrepancy between Linux
1933                  * and the RFC states.
1934                  */
1935                 tcp_send_active_reset(sk, gfp_any());
1936                 sk->sk_err = ECONNRESET;
1937         } else if (old_state == TCP_SYN_SENT)
1938                 sk->sk_err = ECONNRESET;
1939
1940         tcp_clear_xmit_timers(sk);
1941         __skb_queue_purge(&sk->sk_receive_queue);
1942         tcp_write_queue_purge(sk);
1943         __skb_queue_purge(&tp->out_of_order_queue);
1944 #ifdef CONFIG_NET_DMA
1945         __skb_queue_purge(&sk->sk_async_wait_queue);
1946 #endif
1947
1948         inet->dport = 0;
1949
1950         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1951                 inet_reset_saddr(sk);
1952
1953         sk->sk_shutdown = 0;
1954         sock_reset_flag(sk, SOCK_DONE);
1955         tp->srtt = 0;
1956         if ((tp->write_seq += tp->max_window + 2) == 0)
1957                 tp->write_seq = 1;
1958         icsk->icsk_backoff = 0;
1959         tp->snd_cwnd = 2;
1960         icsk->icsk_probes_out = 0;
1961         tp->packets_out = 0;
1962         tp->snd_ssthresh = 0x7fffffff;
1963         tp->snd_cwnd_cnt = 0;
1964         tp->bytes_acked = 0;
1965         tcp_set_ca_state(sk, TCP_CA_Open);
1966         tcp_clear_retrans(tp);
1967         inet_csk_delack_init(sk);
1968         tcp_init_send_head(sk);
1969         memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
1970         __sk_dst_reset(sk);
1971
1972         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1973
1974         sk->sk_error_report(sk);
1975         return err;
1976 }
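/* Illustrative userspace sketch: one way to reach tcp_disconnect() from an
 * application is a connect() with an AF_UNSPEC address, which
 * inet_stream_connect() turns into a call to sk->sk_prot->disconnect():
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));
 *
 * Afterwards the socket is back in TCP_CLOSE and may be reused, e.g. for a
 * new connect().
 */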
1977
1978 /*
1979  *      Socket option code for TCP.
1980  */
1981 static int do_tcp_setsockopt(struct sock *sk, int level,
1982                 int optname, char __user *optval, int optlen)
1983 {
1984         struct tcp_sock *tp = tcp_sk(sk);
1985         struct inet_connection_sock *icsk = inet_csk(sk);
1986         int val;
1987         int err = 0;
1988
1989         /* This is a string value; all the others are ints. */
1990         if (optname == TCP_CONGESTION) {
1991                 char name[TCP_CA_NAME_MAX];
1992
1993                 if (optlen < 1)
1994                         return -EINVAL;
1995
1996                 val = strncpy_from_user(name, optval,
1997                                         min(TCP_CA_NAME_MAX-1, optlen));
1998                 if (val < 0)
1999                         return -EFAULT;
2000                 name[val] = 0;
2001
2002                 lock_sock(sk);
2003                 err = tcp_set_congestion_control(sk, name);
2004                 release_sock(sk);
2005                 return err;
2006         }
2007
2008         if (optlen < sizeof(int))
2009                 return -EINVAL;
2010
2011         if (get_user(val, (int __user *)optval))
2012                 return -EFAULT;
2013
2014         lock_sock(sk);
2015
2016         switch (optname) {
2017         case TCP_MAXSEG:
2018                 /* Values greater than the interface MTU won't take effect.
2019                  * However, at the point when this call is made we typically
2020                  * don't yet know which interface is going to be used. */
2021                 if (val < 8 || val > MAX_TCP_WINDOW) {
2022                         err = -EINVAL;
2023                         break;
2024                 }
2025                 tp->rx_opt.user_mss = val;
2026                 break;
2027
2028         case TCP_NODELAY:
2029                 if (val) {
2030                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2031                          * this option on corked socket is remembered, but
2032                          * it is not activated until cork is cleared.
2033                          *
2034                          * However, when TCP_NODELAY is set we make
2035                          * an explicit push, which overrides even TCP_CORK
2036                          * for currently queued segments.
2037                          */
2038                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2039                         tcp_push_pending_frames(sk);
2040                 } else {
2041                         tp->nonagle &= ~TCP_NAGLE_OFF;
2042                 }
2043                 break;
2044
2045         case TCP_CORK:
2046                 /* When set, this indicates that non-full frames should always
2047                  * be queued.  Later the user clears the option and we transmit
2048                  * any pending partial frames in the queue.  This is meant to be
2049                  * used alongside sendfile() to get properly filled frames when
2050                  * the user (for example) must write out headers with a write()
2051                  * call first and then use sendfile() to send the data parts
2052                  * (see the illustrative userspace sketch after this function).
2053                  *
2054                  * TCP_CORK can be set together with TCP_NODELAY and it is
2055                  * stronger than TCP_NODELAY.
2056                  */
2057                 if (val) {
2058                         tp->nonagle |= TCP_NAGLE_CORK;
2059                 } else {
2060                         tp->nonagle &= ~TCP_NAGLE_CORK;
2061                         if (tp->nonagle&TCP_NAGLE_OFF)
2062                                 tp->nonagle |= TCP_NAGLE_PUSH;
2063                         tcp_push_pending_frames(sk);
2064                 }
2065                 break;
2066
2067         case TCP_KEEPIDLE:
2068                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2069                         err = -EINVAL;
2070                 else {
2071                         tp->keepalive_time = val * HZ;
2072                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2073                             !((1 << sk->sk_state) &
2074                               (TCPF_CLOSE | TCPF_LISTEN))) {
2075                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2076                                 if (tp->keepalive_time > elapsed)
2077                                         elapsed = tp->keepalive_time - elapsed;
2078                                 else
2079                                         elapsed = 0;
2080                                 inet_csk_reset_keepalive_timer(sk, elapsed);
2081                         }
2082                 }
2083                 break;
2084         case TCP_KEEPINTVL:
2085                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2086                         err = -EINVAL;
2087                 else
2088                         tp->keepalive_intvl = val * HZ;
2089                 break;
2090         case TCP_KEEPCNT:
2091                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2092                         err = -EINVAL;
2093                 else
2094                         tp->keepalive_probes = val;
2095                 break;
2096         case TCP_SYNCNT:
2097                 if (val < 1 || val > MAX_TCP_SYNCNT)
2098                         err = -EINVAL;
2099                 else
2100                         icsk->icsk_syn_retries = val;
2101                 break;
2102
2103         case TCP_LINGER2:
2104                 if (val < 0)
2105                         tp->linger2 = -1;
2106                 else if (val > sysctl_tcp_fin_timeout / HZ)
2107                         tp->linger2 = 0;
2108                 else
2109                         tp->linger2 = val * HZ;
2110                 break;
2111
2112         case TCP_DEFER_ACCEPT:
2113                 icsk->icsk_accept_queue.rskq_defer_accept = 0;
2114                 if (val > 0) {
2115                         /* Translate value in seconds to number of
2116                          * retransmits */
2117                         while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2118                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2119                                        icsk->icsk_accept_queue.rskq_defer_accept))
2120                                 icsk->icsk_accept_queue.rskq_defer_accept++;
2121                         icsk->icsk_accept_queue.rskq_defer_accept++;
2122                 }
2123                 break;
2124
2125         case TCP_WINDOW_CLAMP:
2126                 if (!val) {
2127                         if (sk->sk_state != TCP_CLOSE) {
2128                                 err = -EINVAL;
2129                                 break;
2130                         }
2131                         tp->window_clamp = 0;
2132                 } else
2133                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2134                                                 SOCK_MIN_RCVBUF / 2 : val;
2135                 break;
2136
2137         case TCP_QUICKACK:
2138                 if (!val) {
2139                         icsk->icsk_ack.pingpong = 1;
2140                 } else {
2141                         icsk->icsk_ack.pingpong = 0;
2142                         if ((1 << sk->sk_state) &
2143                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2144                             inet_csk_ack_scheduled(sk)) {
2145                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2146                                 tcp_cleanup_rbuf(sk, 1);
2147                                 if (!(val & 1))
2148                                         icsk->icsk_ack.pingpong = 1;
2149                         }
2150                 }
2151                 break;
2152
2153 #ifdef CONFIG_TCP_MD5SIG
2154         case TCP_MD5SIG:
2155                 /* Read the IP->Key mappings from userspace */
2156                 err = tp->af_specific->md5_parse(sk, optval, optlen);
2157                 break;
2158 #endif
2159
2160         default:
2161                 err = -ENOPROTOOPT;
2162                 break;
2163         }
2164
2165         release_sock(sk);
2166         return err;
2167 }
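/* Illustrative userspace sketch of the TCP_CORK pattern described in the
 * option handling above; the header/body split and the "hdr"/"file_fd"
 * names are hypothetical.
 *
 *	int on = 1, off = 0;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *	write(fd, hdr, hdr_len);
 *	sendfile(fd, file_fd, NULL, body_len);
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *
 * Clearing TCP_CORK pushes out any partial frame still sitting in the queue.
 */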
2168
2169 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2170                    int optlen)
2171 {
2172         struct inet_connection_sock *icsk = inet_csk(sk);
2173
2174         if (level != SOL_TCP)
2175                 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2176                                                      optval, optlen);
2177         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2178 }
2179
2180 #ifdef CONFIG_COMPAT
2181 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2182                           char __user *optval, int optlen)
2183 {
2184         if (level != SOL_TCP)
2185                 return inet_csk_compat_setsockopt(sk, level, optname,
2186                                                   optval, optlen);
2187         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2188 }
2189
2190 EXPORT_SYMBOL(compat_tcp_setsockopt);
2191 #endif
2192
2193 /* Return information about state of tcp endpoint in API format. */
2194 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2195 {
2196         struct tcp_sock *tp = tcp_sk(sk);
2197         const struct inet_connection_sock *icsk = inet_csk(sk);
2198         u32 now = tcp_time_stamp;
2199
2200         memset(info, 0, sizeof(*info));
2201
2202         info->tcpi_state = sk->sk_state;
2203         info->tcpi_ca_state = icsk->icsk_ca_state;
2204         info->tcpi_retransmits = icsk->icsk_retransmits;
2205         info->tcpi_probes = icsk->icsk_probes_out;
2206         info->tcpi_backoff = icsk->icsk_backoff;
2207
2208         if (tp->rx_opt.tstamp_ok)
2209                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2210         if (tcp_is_sack(tp))
2211                 info->tcpi_options |= TCPI_OPT_SACK;
2212         if (tp->rx_opt.wscale_ok) {
2213                 info->tcpi_options |= TCPI_OPT_WSCALE;
2214                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2215                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2216         }
2217
2218         if (tp->ecn_flags&TCP_ECN_OK)
2219                 info->tcpi_options |= TCPI_OPT_ECN;
2220
2221         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2222         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2223         info->tcpi_snd_mss = tp->mss_cache;
2224         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2225
2226         if (sk->sk_state == TCP_LISTEN) {
2227                 info->tcpi_unacked = sk->sk_ack_backlog;
2228                 info->tcpi_sacked = sk->sk_max_ack_backlog;
2229         } else {
2230                 info->tcpi_unacked = tp->packets_out;
2231                 info->tcpi_sacked = tp->sacked_out;
2232         }
2233         info->tcpi_lost = tp->lost_out;
2234         info->tcpi_retrans = tp->retrans_out;
2235         info->tcpi_fackets = tp->fackets_out;
2236
2237         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2238         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2239         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2240
2241         info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2242         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2243         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2244         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2245         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2246         info->tcpi_snd_cwnd = tp->snd_cwnd;
2247         info->tcpi_advmss = tp->advmss;
2248         info->tcpi_reordering = tp->reordering;
2249
2250         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2251         info->tcpi_rcv_space = tp->rcvq_space.space;
2252
2253         info->tcpi_total_retrans = tp->total_retrans;
2254 }
2255
2256 EXPORT_SYMBOL_GPL(tcp_get_info);
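/* Illustrative userspace sketch: the fields filled in above reach
 * applications via getsockopt(TCP_INFO), handled in do_tcp_getsockopt()
 * below.
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt %u us  cwnd %u  retrans %u\n",
 *		       info.tcpi_rtt, info.tcpi_snd_cwnd,
 *		       info.tcpi_total_retrans);
 */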
2257
2258 static int do_tcp_getsockopt(struct sock *sk, int level,
2259                 int optname, char __user *optval, int __user *optlen)
2260 {
2261         struct inet_connection_sock *icsk = inet_csk(sk);
2262         struct tcp_sock *tp = tcp_sk(sk);
2263         int val, len;
2264
2265         if (get_user(len, optlen))
2266                 return -EFAULT;
2267
2268         len = min_t(unsigned int, len, sizeof(int));
2269
2270         if (len < 0)
2271                 return -EINVAL;
2272
2273         switch (optname) {
2274         case TCP_MAXSEG:
2275                 val = tp->mss_cache;
2276                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2277                         val = tp->rx_opt.user_mss;
2278                 break;
2279         case TCP_NODELAY:
2280                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2281                 break;
2282         case TCP_CORK:
2283                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2284                 break;
2285         case TCP_KEEPIDLE:
2286                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2287                 break;
2288         case TCP_KEEPINTVL:
2289                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2290                 break;
2291         case TCP_KEEPCNT:
2292                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2293                 break;
2294         case TCP_SYNCNT:
2295                 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2296                 break;
2297         case TCP_LINGER2:
2298                 val = tp->linger2;
2299                 if (val >= 0)
2300                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2301                 break;
2302         case TCP_DEFER_ACCEPT:
2303                 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2304                         ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2305                 break;
2306         case TCP_WINDOW_CLAMP:
2307                 val = tp->window_clamp;
2308                 break;
2309         case TCP_INFO: {
2310                 struct tcp_info info;
2311
2312                 if (get_user(len, optlen))
2313                         return -EFAULT;
2314
2315                 tcp_get_info(sk, &info);
2316
2317                 len = min_t(unsigned int, len, sizeof(info));
2318                 if (put_user(len, optlen))
2319                         return -EFAULT;
2320                 if (copy_to_user(optval, &info, len))
2321                         return -EFAULT;
2322                 return 0;
2323         }
2324         case TCP_QUICKACK:
2325                 val = !icsk->icsk_ack.pingpong;
2326                 break;
2327
2328         case TCP_CONGESTION:
2329                 if (get_user(len, optlen))
2330                         return -EFAULT;
2331                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2332                 if (put_user(len, optlen))
2333                         return -EFAULT;
2334                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2335                         return -EFAULT;
2336                 return 0;
2337         default:
2338                 return -ENOPROTOOPT;
2339         }
2340
2341         if (put_user(len, optlen))
2342                 return -EFAULT;
2343         if (copy_to_user(optval, &val, len))
2344                 return -EFAULT;
2345         return 0;
2346 }
2347
2348 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2349                    int __user *optlen)
2350 {
2351         struct inet_connection_sock *icsk = inet_csk(sk);
2352
2353         if (level != SOL_TCP)
2354                 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2355                                                      optval, optlen);
2356         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2357 }
2358
2359 #ifdef CONFIG_COMPAT
2360 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2361                           char __user *optval, int __user *optlen)
2362 {
2363         if (level != SOL_TCP)
2364                 return inet_csk_compat_getsockopt(sk, level, optname,
2365                                                   optval, optlen);
2366         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2367 }
2368
2369 EXPORT_SYMBOL(compat_tcp_getsockopt);
2370 #endif
2371
2372 struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2373 {
2374         struct sk_buff *segs = ERR_PTR(-EINVAL);
2375         struct tcphdr *th;
2376         unsigned thlen;
2377         unsigned int seq;
2378         __be32 delta;
2379         unsigned int oldlen;
2380         unsigned int len;
2381
2382         if (!pskb_may_pull(skb, sizeof(*th)))
2383                 goto out;
2384
2385         th = tcp_hdr(skb);
2386         thlen = th->doff * 4;
2387         if (thlen < sizeof(*th))
2388                 goto out;
2389
2390         if (!pskb_may_pull(skb, thlen))
2391                 goto out;
2392
2393         oldlen = (u16)~skb->len;
2394         __skb_pull(skb, thlen);
2395
2396         if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2397                 /* Packet is from an untrusted source, reset gso_segs. */
2398                 int type = skb_shinfo(skb)->gso_type;
2399                 int mss;
2400
2401                 if (unlikely(type &
2402                              ~(SKB_GSO_TCPV4 |
2403                                SKB_GSO_DODGY |
2404                                SKB_GSO_TCP_ECN |
2405                                SKB_GSO_TCPV6 |
2406                                0) ||
2407                              !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2408                         goto out;
2409
2410                 mss = skb_shinfo(skb)->gso_size;
2411                 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
2412
2413                 segs = NULL;
2414                 goto out;
2415         }
2416
2417         segs = skb_segment(skb, features);
2418         if (IS_ERR(segs))
2419                 goto out;
2420
2421         len = skb_shinfo(skb)->gso_size;
2422         delta = htonl(oldlen + (thlen + len));
2423
2424         skb = segs;
2425         th = tcp_hdr(skb);
2426         seq = ntohl(th->seq);
2427
2428         do {
2429                 th->fin = th->psh = 0;
2430
2431                 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2432                                        (__force u32)delta));
2433                 if (skb->ip_summed != CHECKSUM_PARTIAL)
2434                         th->check =
2435                              csum_fold(csum_partial(skb_transport_header(skb),
2436                                                     thlen, skb->csum));
2437
2438                 seq += len;
2439                 skb = skb->next;
2440                 th = tcp_hdr(skb);
2441
2442                 th->seq = htonl(seq);
2443                 th->cwr = 0;
2444         } while (skb->next);
2445
2446         delta = htonl(oldlen + (skb->tail - skb->transport_header) +
2447                       skb->data_len);
2448         th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2449                                 (__force u32)delta));
2450         if (skb->ip_summed != CHECKSUM_PARTIAL)
2451                 th->check = csum_fold(csum_partial(skb_transport_header(skb),
2452                                                    thlen, skb->csum));
2453
2454 out:
2455         return segs;
2456 }
2457 EXPORT_SYMBOL(tcp_tso_segment);
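/* Illustrative note on the checksum fixups above, which work in 16-bit
 * one's-complement arithmetic: oldlen holds the complement of the original
 * superframe length, so folding it into the stored partial checksum cancels
 * the old TCP-length term of the pseudo-header, and adding (thlen + gso_size)
 * puts the per-segment length back in, roughly
 *
 *	new_check = ~csum_fold(old_check + ~old_len + (thlen + per_seg_len))
 *
 * where old_len, per_seg_len and new_check are hypothetical names.  The last
 * (possibly shorter) segment recomputes its length from the actual tail.
 */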
2458
2459 #ifdef CONFIG_TCP_MD5SIG
2460 static unsigned long tcp_md5sig_users;
2461 static struct tcp_md5sig_pool **tcp_md5sig_pool;
2462 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2463
2464 int tcp_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
2465                       int bplen,
2466                       struct tcphdr *th, unsigned int tcplen,
2467                       struct tcp_md5sig_pool *hp)
2468 {
2469         struct scatterlist sg[4];
2470         __u16 data_len;
2471         int block = 0;
2472         __sum16 cksum;
2473         struct hash_desc *desc = &hp->md5_desc;
2474         int err;
2475         unsigned int nbytes = 0;
2476
2477         sg_init_table(sg, 4);
2478
2479         /* 1. The TCP pseudo-header */
2480         sg_set_buf(&sg[block++], &hp->md5_blk, bplen);
2481         nbytes += bplen;
2482
2483         /* 2. The TCP header, excluding options, and assuming a
2484          * checksum of zero
2485          */
2486         cksum = th->check;
2487         th->check = 0;
2488         sg_set_buf(&sg[block++], th, sizeof(*th));
2489         nbytes += sizeof(*th);
2490
2491         /* 3. The TCP segment data (if any) */
2492         data_len = tcplen - (th->doff << 2);
2493         if (data_len > 0) {
2494                 u8 *data = (u8 *)th + (th->doff << 2);
2495                 sg_set_buf(&sg[block++], data, data_len);
2496                 nbytes += data_len;
2497         }
2498
2499         /* 4. an independently-specified key or password, known to both
2500          * TCPs and presumably connection-specific
2501          */
2502         sg_set_buf(&sg[block++], key->key, key->keylen);
2503         nbytes += key->keylen;
2504
2505         sg_mark_end(&sg[block - 1]);
2506
2507         /* Now store the hash into the packet */
2508         err = crypto_hash_init(desc);
2509         if (err) {
2510                 if (net_ratelimit())
2511                         printk(KERN_WARNING "%s(): hash_init failed\n", __func__);
2512                 return -1;
2513         }
2514         err = crypto_hash_update(desc, sg, nbytes);
2515         if (err) {
2516                 if (net_ratelimit())
2517                         printk(KERN_WARNING "%s(): hash_update failed\n", __func__);
2518                 return -1;
2519         }
2520         err = crypto_hash_final(desc, md5_hash);
2521         if (err) {
2522                 if (net_ratelimit())
2523                         printk(KERN_WARNING "%s(): hash_final failed\n", __func__);
2524                 return -1;
2525         }
2526
2527         /* Reset header */
2528         th->check = cksum;
2529
2530         return 0;
2531 }
2532 EXPORT_SYMBOL(tcp_calc_md5_hash);
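/* Illustrative userspace sketch: the per-peer keys hashed above are installed
 * through the TCP_MD5SIG socket option parsed in do_tcp_setsockopt().  The
 * peer address "peer_sin" and the key value are hypothetical.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */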
2533
2534 static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2535 {
2536         int cpu;
2537         for_each_possible_cpu(cpu) {
2538                 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
2539                 if (p) {
2540                         if (p->md5_desc.tfm)
2541                                 crypto_free_hash(p->md5_desc.tfm);
2542                         kfree(p);
2543                         p = NULL;
2544                 }
2545         }
2546         free_percpu(pool);
2547 }
2548
2549 void tcp_free_md5sig_pool(void)
2550 {
2551         struct tcp_md5sig_pool **pool = NULL;
2552
2553         spin_lock_bh(&tcp_md5sig_pool_lock);
2554         if (--tcp_md5sig_users == 0) {
2555                 pool = tcp_md5sig_pool;
2556                 tcp_md5sig_pool = NULL;
2557         }
2558         spin_unlock_bh(&tcp_md5sig_pool_lock);
2559         if (pool)
2560                 __tcp_free_md5sig_pool(pool);
2561 }
2562
2563 EXPORT_SYMBOL(tcp_free_md5sig_pool);
2564
2565 static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
2566 {
2567         int cpu;
2568         struct tcp_md5sig_pool **pool;
2569
2570         pool = alloc_percpu(struct tcp_md5sig_pool *);
2571         if (!pool)
2572                 return NULL;
2573
2574         for_each_possible_cpu(cpu) {
2575                 struct tcp_md5sig_pool *p;
2576                 struct crypto_hash *hash;
2577
2578                 p = kzalloc(sizeof(*p), GFP_KERNEL);
2579                 if (!p)
2580                         goto out_free;
2581                 *per_cpu_ptr(pool, cpu) = p;
2582
2583                 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
2584                 if (!hash || IS_ERR(hash))
2585                         goto out_free;
2586
2587                 p->md5_desc.tfm = hash;
2588         }
2589         return pool;
2590 out_free:
2591         __tcp_free_md5sig_pool(pool);
2592         return NULL;
2593 }
2594
2595 struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
2596 {
2597         struct tcp_md5sig_pool **pool;
2598         int alloc = 0;
2599
2600 retry:
2601         spin_lock_bh(&tcp_md5sig_pool_lock);
2602         pool = tcp_md5sig_pool;
2603         if (tcp_md5sig_users++ == 0) {
2604                 alloc = 1;
2605                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2606         } else if (!pool) {
2607                 tcp_md5sig_users--;
2608                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2609                 cpu_relax();
2610                 goto retry;
2611         } else
2612                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2613
2614         if (alloc) {
2615                 /* we cannot hold spinlock here because this may sleep. */
2616                 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
2617                 spin_lock_bh(&tcp_md5sig_pool_lock);
2618                 if (!p) {
2619                         tcp_md5sig_users--;
2620                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2621                         return NULL;
2622                 }
2623                 pool = tcp_md5sig_pool;
2624                 if (pool) {
2625                         /* Another user installed a pool meanwhile; free ours, use theirs. */
2626                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2627                         __tcp_free_md5sig_pool(p);
2628                 } else {
2629                         tcp_md5sig_pool = pool = p;
2630                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2631                 }
2632         }
2633         return pool;
2634 }
2635
2636 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2637
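/*
 * Grab a reference on the pool and return the entry for @cpu, or NULL if
 * no pool has been allocated.  The caller must keep @cpu stable; callers
 * normally go through the tcp_get_md5sig_pool()/tcp_put_md5sig_pool()
 * helpers, which pin the current CPU for the duration of the hash, e.g.
 *
 *	hp = tcp_get_md5sig_pool();
 *	if (!hp)
 *		return -1;
 *	... feed the segment to hp->md5_desc ...
 *	tcp_put_md5sig_pool();
 */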
2638 struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
2639 {
2640         struct tcp_md5sig_pool **p;
2641         spin_lock_bh(&tcp_md5sig_pool_lock);
2642         p = tcp_md5sig_pool;
2643         if (p)
2644                 tcp_md5sig_users++;
2645         spin_unlock_bh(&tcp_md5sig_pool_lock);
2646         return (p ? *per_cpu_ptr(p, cpu) : NULL);
2647 }
2648
2649 EXPORT_SYMBOL(__tcp_get_md5sig_pool);
2650
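/* Release the reference taken by __tcp_get_md5sig_pool(). */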
2651 void __tcp_put_md5sig_pool(void)
2652 {
2653         tcp_free_md5sig_pool();
2654 }
2655
2656 EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2657 #endif
2658
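/*
 * Move a socket to TCP_CLOSE: account failed connection attempts, stop the
 * retransmit timers, and either wake the owner or, if the socket is already
 * orphaned, destroy it.
 */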
2659 void tcp_done(struct sock *sk)
2660 {
2661         if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2662                 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2663
2664         tcp_set_state(sk, TCP_CLOSE);
2665         tcp_clear_xmit_timers(sk);
2666
2667         sk->sk_shutdown = SHUTDOWN_MASK;
2668
2669         if (!sock_flag(sk, SOCK_DEAD))
2670                 sk->sk_state_change(sk);
2671         else
2672                 inet_csk_destroy_sock(sk);
2673 }
2674 EXPORT_SYMBOL_GPL(tcp_done);
2675
2676 extern struct tcp_congestion_ops tcp_reno;
2677
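/*
 * "thash_entries=N" on the kernel command line overrides the size of the
 * established hash table allocated in tcp_init() below.
 */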
2678 static __initdata unsigned long thash_entries;
2679 static int __init set_thash_entries(char *str)
2680 {
2681         if (!str)
2682                 return 0;
2683         thash_entries = simple_strtoul(str, &str, 0);
2684         return 1;
2685 }
2686 __setup("thash_entries=", set_thash_entries);
2687
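/*
 * Boot-time TCP setup: create the bind-bucket cache, size and allocate the
 * established and bind hash tables, scale the time-wait/orphan/SYN-backlog
 * limits to the amount of memory, derive the tcp_mem/tcp_wmem/tcp_rmem
 * defaults, and register the Reno congestion control.
 */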
2688 void __init tcp_init(void)
2689 {
2690         struct sk_buff *skb = NULL;
2691         unsigned long limit;
2692         int order, i, max_share;
2693
2694         BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2695
2696         tcp_hashinfo.bind_bucket_cachep =
2697                 kmem_cache_create("tcp_bind_bucket",
2698                                   sizeof(struct inet_bind_bucket), 0,
2699                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2700
2701         /* Size and allocate the main established and bind bucket
2702          * hash tables.
2703          *
2704          * The methodology is similar to that of the buffer cache.
2705          */
2706         tcp_hashinfo.ehash =
2707                 alloc_large_system_hash("TCP established",
2708                                         sizeof(struct inet_ehash_bucket),
2709                                         thash_entries,
2710                                         (num_physpages >= 128 * 1024) ?
2711                                         13 : 15,
2712                                         0,
2713                                         &tcp_hashinfo.ehash_size,
2714                                         NULL,
2715                                         thash_entries ? 0 : 512 * 1024);
2716         tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2717         for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2718                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2719                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
2720         }
2721         if (inet_ehash_locks_alloc(&tcp_hashinfo))
2722                 panic("TCP: failed to alloc ehash_locks");
2723         tcp_hashinfo.bhash =
2724                 alloc_large_system_hash("TCP bind",
2725                                         sizeof(struct inet_bind_hashbucket),
2726                                         tcp_hashinfo.ehash_size,
2727                                         (num_physpages >= 128 * 1024) ?
2728                                         13 : 15,
2729                                         0,
2730                                         &tcp_hashinfo.bhash_size,
2731                                         NULL,
2732                                         64 * 1024);
2733         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2734         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2735                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2736                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2737         }
2738
2739         /* Try to be a bit smarter and adjust defaults depending
2740          * on available memory.
2741          */
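        /* The allocation order of the bind hash table below doubles as a
         * rough measure of machine size for scaling the time-wait, orphan
         * and SYN-backlog limits.
         */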
2742         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2743                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2744                         order++)
2745                 ;
2746         if (order >= 4) {
2747                 tcp_death_row.sysctl_max_tw_buckets = 180000;
2748                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2749                 sysctl_max_syn_backlog = 1024;
2750         } else if (order < 3) {
2751                 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2752                 sysctl_tcp_max_orphans >>= (3 - order);
2753                 sysctl_max_syn_backlog = 128;
2754         }
2755
2756         /* Set the pressure threshold to be a fraction of global memory that
2757          * scales up to 1/2 of memory at 256 MB and above and falls toward
2758          * zero on smaller machines, with a floor of 128 pages.
2759          */
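        /* This works out to roughly 1/8 of memory on a 64 MB machine and
         * 1/4 on a 128 MB machine.
         */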
2760         limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2761         limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2762         limit = max(limit, 128UL);
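        /* tcp_mem[] is in pages: below [0] TCP does not worry about its
         * memory use, [1] is where it starts to moderate allocations, and
         * [2] is the hard limit.
         */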
2763         sysctl_tcp_mem[0] = limit / 4 * 3;
2764         sysctl_tcp_mem[1] = limit;
2765         sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
2766
2767         /* Set per-socket limits to no more than 1/128 of the pressure threshold */
2768         limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2769         max_share = min(4UL*1024*1024, limit);
2770
2771         sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
2772         sysctl_tcp_wmem[1] = 16*1024;
2773         sysctl_tcp_wmem[2] = max(64*1024, max_share);
2774
2775         sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
2776         sysctl_tcp_rmem[1] = 87380;
2777         sysctl_tcp_rmem[2] = max(87380, max_share);
2778
2779         printk(KERN_INFO "TCP: Hash tables configured "
2780                "(established %d bind %d)\n",
2781                tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
2782
2783         tcp_register_congestion_control(&tcp_reno);
2784 }
2785
2786 EXPORT_SYMBOL(tcp_close);
2787 EXPORT_SYMBOL(tcp_disconnect);
2788 EXPORT_SYMBOL(tcp_getsockopt);
2789 EXPORT_SYMBOL(tcp_ioctl);
2790 EXPORT_SYMBOL(tcp_poll);
2791 EXPORT_SYMBOL(tcp_read_sock);
2792 EXPORT_SYMBOL(tcp_recvmsg);
2793 EXPORT_SYMBOL(tcp_sendmsg);
2794 EXPORT_SYMBOL(tcp_splice_read);
2795 EXPORT_SYMBOL(tcp_sendpage);
2796 EXPORT_SYMBOL(tcp_setsockopt);
2797 EXPORT_SYMBOL(tcp_shutdown);
2798 EXPORT_SYMBOL(tcp_statistics);