[ICSK]: Generalise tcp_listen_{start,stop}
[safe/jmp/linux-2.6] net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
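/*
 * Illustrative walk through the states above (a sketch, not a new state
 * definition): an active close typically moves
 *
 *      ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 *
 * while the passive side of the same close sees
 *
 *      ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 */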
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
275
276 EXPORT_SYMBOL_GPL(tcp_orphan_count);
277
278 int sysctl_tcp_mem[3];
279 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
280 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
281
282 EXPORT_SYMBOL(sysctl_tcp_mem);
283 EXPORT_SYMBOL(sysctl_tcp_rmem);
284 EXPORT_SYMBOL(sysctl_tcp_wmem);
285
286 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
287 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
288
289 EXPORT_SYMBOL(tcp_memory_allocated);
290 EXPORT_SYMBOL(tcp_sockets_allocated);
291
292 /*
293  * Pressure flag: try to collapse.
294  * Technical note: it is used by multiple contexts non-atomically.
295  * All of sk_stream_mem_schedule() is of this nature: accounting
296  * is strict, actions are advisory and have some latency.
297  */
298 int tcp_memory_pressure;
299
300 EXPORT_SYMBOL(tcp_memory_pressure);
301
302 void tcp_enter_memory_pressure(void)
303 {
304         if (!tcp_memory_pressure) {
305                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
306                 tcp_memory_pressure = 1;
307         }
308 }
309
310 EXPORT_SYMBOL(tcp_enter_memory_pressure);
311
312 /*
313  * LISTEN is a special case for poll..
314  */
315 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
316                                                poll_table *wait)
317 {
318         return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0;
319 }
320
321 /*
322  *      Wait for a TCP event.
323  *
324  *      Note that we don't need to lock the socket, as the upper poll layers
325  *      take care of normal races (between the test and the event) and we don't
326  *      go look at any of the socket buffers directly.
327  */
328 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
329 {
330         unsigned int mask;
331         struct sock *sk = sock->sk;
332         struct tcp_sock *tp = tcp_sk(sk);
333
334         poll_wait(file, sk->sk_sleep, wait);
335         if (sk->sk_state == TCP_LISTEN)
336                 return tcp_listen_poll(sk, wait);
337
338         /* Socket is not locked. We are protected from async events
339            by the poll logic, and correct handling of state changes
340            made by other threads is impossible in any case.
341          */
342
343         mask = 0;
344         if (sk->sk_err)
345                 mask = POLLERR;
346
347         /*
348          * POLLHUP is certainly not done right. But poll() doesn't
349          * have a notion of HUP in just one direction, and for a
350          * socket the read side is more interesting.
351          *
352          * Some poll() documentation says that POLLHUP is incompatible
353          * with the POLLOUT/POLLWR flags, so somebody should check this
354          * all. But careful, it tends to be safer to return too many
355          * bits than too few, and you can easily break real applications
356          * if you don't tell them that something has hung up!
357          *
358          * Check-me.
359          *
360          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
361          * our fs/select.c). It means that after we have received EOF,
362          * poll always returns immediately, making poll() on write() impossible
363          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
364          * if and only if shutdown has been made in both directions.
365          * Actually, it is interesting to look at how Solaris and DUX
366          * solve this dilemma. I would prefer it if POLLHUP were maskable;
367          * then we could set it on SND_SHUTDOWN. BTW the examples given
368          * in Stevens' books assume exactly this behaviour, which explains
369          * why POLLHUP is incompatible with POLLOUT.    --ANK
370          *
371          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
372          * blocking on a fresh not-yet-connected or disconnected socket. --ANK
373          */
374         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
375                 mask |= POLLHUP;
376         if (sk->sk_shutdown & RCV_SHUTDOWN)
377                 mask |= POLLIN | POLLRDNORM;
378
379         /* Connected? */
380         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
381                 /* Potential race condition. If the read of tp below is
382                  * reordered above the read of sk->sk_state, we can be woken
383                  * spuriously in SYN_* states. */
384                 if ((tp->rcv_nxt != tp->copied_seq) &&
385                     (tp->urg_seq != tp->copied_seq ||
386                      tp->rcv_nxt != tp->copied_seq + 1 ||
387                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
388                         mask |= POLLIN | POLLRDNORM;
389
390                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
391                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
392                                 mask |= POLLOUT | POLLWRNORM;
393                         } else {  /* send SIGIO later */
394                                 set_bit(SOCK_ASYNC_NOSPACE,
395                                         &sk->sk_socket->flags);
396                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
397
398                                 /* Race breaker. If space is freed after
399                                  * wspace test but before the flags are set,
400                                  * IO signal will be lost.
401                                  */
402                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
403                                         mask |= POLLOUT | POLLWRNORM;
404                         }
405                 }
406
407                 if (tp->urg_data & TCP_URG_VALID)
408                         mask |= POLLPRI;
409         }
410         return mask;
411 }
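/*
 * The mask computed above is what an application sees from poll(2). A user
 * space sketch (placeholder names, nothing below is defined in this file):
 *
 *      struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
 *      int n = poll(&pfd, 1, timeout_ms);
 *
 *      POLLIN|POLLRDNORM  - data to read, or the peer has shut down
 *      POLLOUT|POLLWRNORM - enough send buffer space for a write
 *      POLLPRI            - urgent (out-of-band) data is pending
 *      POLLHUP            - both directions shut down, or socket in TCP_CLOSE
 */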
412
413 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
414 {
415         struct tcp_sock *tp = tcp_sk(sk);
416         int answ;
417
418         switch (cmd) {
419         case SIOCINQ:
420                 if (sk->sk_state == TCP_LISTEN)
421                         return -EINVAL;
422
423                 lock_sock(sk);
424                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
425                         answ = 0;
426                 else if (sock_flag(sk, SOCK_URGINLINE) ||
427                          !tp->urg_data ||
428                          before(tp->urg_seq, tp->copied_seq) ||
429                          !before(tp->urg_seq, tp->rcv_nxt)) {
430                         answ = tp->rcv_nxt - tp->copied_seq;
431
432                         /* Subtract 1, if FIN is in queue. */
433                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
434                                 answ -=
435                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
436                 } else
437                         answ = tp->urg_seq - tp->copied_seq;
438                 release_sock(sk);
439                 break;
440         case SIOCATMARK:
441                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
442                 break;
443         case SIOCOUTQ:
444                 if (sk->sk_state == TCP_LISTEN)
445                         return -EINVAL;
446
447                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
448                         answ = 0;
449                 else
450                         answ = tp->write_seq - tp->snd_una;
451                 break;
452         default:
453                 return -ENOIOCTLCMD;
454         };
455
456         return put_user(answ, (int __user *)arg);
457 }
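/*
 * The SIOCINQ and SIOCOUTQ cases above back the classic ioctl() queries on a
 * connected TCP socket. User space sketch ("fd" is a placeholder):
 *
 *      int inq, outq;
 *
 *      ioctl(fd, SIOCINQ, &inq);       bytes readable (minus a queued FIN)
 *      ioctl(fd, SIOCOUTQ, &outq);     bytes written but not yet acknowledged
 *
 * FIONREAD is an alias for SIOCINQ.
 */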
458
459 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
460 {
461         struct inet_sock *inet = inet_sk(sk);
462         struct inet_connection_sock *icsk = inet_csk(sk);
463         int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
464
465         if (rc != 0)
466                 return rc;
467
468         sk->sk_max_ack_backlog = 0;
469         sk->sk_ack_backlog = 0;
470         inet_csk_delack_init(sk);
471
472         /* There is a race window here: we announce ourselves as listening,
473          * but this transition has not yet been validated by get_port().
474          * It is OK, because the socket enters the hash table only
475          * after validation is complete.
476          */
477         sk->sk_state = TCP_LISTEN;
478         if (!sk->sk_prot->get_port(sk, inet->num)) {
479                 inet->sport = htons(inet->num);
480
481                 sk_dst_reset(sk);
482                 sk->sk_prot->hash(sk);
483
484                 return 0;
485         }
486
487         sk->sk_state = TCP_CLOSE;
488         __reqsk_queue_destroy(&icsk->icsk_accept_queue);
489         return -EADDRINUSE;
490 }
491
492 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
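/*
 * A protocol's listen path is expected to call the helper above roughly as in
 * the sketch below ("some_proto_listen" and the use of TCP_SYNQ_HSIZE as the
 * table size are illustrative assumptions, not taken from this file):
 *
 *      static int some_proto_listen(struct sock *sk, int backlog)
 *      {
 *              int err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
 *
 *              if (!err)
 *                      sk->sk_max_ack_backlog = backlog;
 *              return err;
 *      }
 */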
493
494 /*
495  *      This routine closes sockets which have been at least partially
496  *      opened, but not yet accepted.
497  */
498 static void inet_csk_listen_stop(struct sock *sk)
499 {
500         struct inet_connection_sock *icsk = inet_csk(sk);
501         struct request_sock *acc_req;
502         struct request_sock *req;
503
504         inet_csk_delete_keepalive_timer(sk);
505
506         /* make all the listen_opt local to us */
507         acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
508
509         /* Following the specs, it would be better either to send a FIN
510          * (and enter FIN-WAIT-1, i.e. a normal close)
511          * or to send an active reset (abort).
512          * Certainly, this is pretty dangerous during a synflood, but that is
513          * a poor justification for our negligence 8)
514          * To be honest, we are not able to implement either
515          * of the variants now.                 --ANK
516          */
517         reqsk_queue_destroy(&icsk->icsk_accept_queue);
518
519         while ((req = acc_req) != NULL) {
520                 struct sock *child = req->sk;
521
522                 acc_req = req->dl_next;
523
524                 local_bh_disable();
525                 bh_lock_sock(child);
526                 BUG_TRAP(!sock_owned_by_user(child));
527                 sock_hold(child);
528
529                 sk->sk_prot->disconnect(child, O_NONBLOCK);
530
531                 sock_orphan(child);
532
533                 atomic_inc(sk->sk_prot->orphan_count);
534
535                 inet_csk_destroy_sock(child);
536
537                 bh_unlock_sock(child);
538                 local_bh_enable();
539                 sock_put(child);
540
541                 sk_acceptq_removed(sk);
542                 __reqsk_free(req);
543         }
544         BUG_TRAP(!sk->sk_ack_backlog);
545 }
546
547 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
548
549 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
550 {
551         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
552         tp->pushed_seq = tp->write_seq;
553 }
554
555 static inline int forced_push(struct tcp_sock *tp)
556 {
557         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
558 }
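/*
 * Worked example for forced_push(): with tp->max_window == 65535, a push is
 * forced as soon as write_seq runs more than 32767 bytes (half the largest
 * window seen so far) ahead of pushed_seq.
 */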
559
560 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
561                               struct sk_buff *skb)
562 {
563         skb->csum = 0;
564         TCP_SKB_CB(skb)->seq = tp->write_seq;
565         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
566         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
567         TCP_SKB_CB(skb)->sacked = 0;
568         skb_header_release(skb);
569         __skb_queue_tail(&sk->sk_write_queue, skb);
570         sk_charge_skb(sk, skb);
571         if (!sk->sk_send_head)
572                 sk->sk_send_head = skb;
573         if (tp->nonagle & TCP_NAGLE_PUSH)
574                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
575 }
576
577 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
578                                 struct sk_buff *skb)
579 {
580         if (flags & MSG_OOB) {
581                 tp->urg_mode = 1;
582                 tp->snd_up = tp->write_seq;
583                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
584         }
585 }
586
587 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
588                             int mss_now, int nonagle)
589 {
590         if (sk->sk_send_head) {
591                 struct sk_buff *skb = sk->sk_write_queue.prev;
592                 if (!(flags & MSG_MORE) || forced_push(tp))
593                         tcp_mark_push(tp, skb);
594                 tcp_mark_urg(tp, flags, skb);
595                 __tcp_push_pending_frames(sk, tp, mss_now,
596                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
597         }
598 }
599
600 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
601                          size_t psize, int flags)
602 {
603         struct tcp_sock *tp = tcp_sk(sk);
604         int mss_now, size_goal;
605         int err;
606         ssize_t copied;
607         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
608
609         /* Wait for a connection to finish. */
610         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
611                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
612                         goto out_err;
613
614         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
615
616         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
617         size_goal = tp->xmit_size_goal;
618         copied = 0;
619
620         err = -EPIPE;
621         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
622                 goto do_error;
623
624         while (psize > 0) {
625                 struct sk_buff *skb = sk->sk_write_queue.prev;
626                 struct page *page = pages[poffset / PAGE_SIZE];
627                 int copy, i, can_coalesce;
628                 int offset = poffset % PAGE_SIZE;
629                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
630
631                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
632 new_segment:
633                         if (!sk_stream_memory_free(sk))
634                                 goto wait_for_sndbuf;
635
636                         skb = sk_stream_alloc_pskb(sk, 0, 0,
637                                                    sk->sk_allocation);
638                         if (!skb)
639                                 goto wait_for_memory;
640
641                         skb_entail(sk, tp, skb);
642                         copy = size_goal;
643                 }
644
645                 if (copy > size)
646                         copy = size;
647
648                 i = skb_shinfo(skb)->nr_frags;
649                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
650                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
651                         tcp_mark_push(tp, skb);
652                         goto new_segment;
653                 }
654                 if (sk->sk_forward_alloc < copy &&
655                     !sk_stream_mem_schedule(sk, copy, 0))
656                         goto wait_for_memory;
657                 
658                 if (can_coalesce) {
659                         skb_shinfo(skb)->frags[i - 1].size += copy;
660                 } else {
661                         get_page(page);
662                         skb_fill_page_desc(skb, i, page, offset, copy);
663                 }
664
665                 skb->len += copy;
666                 skb->data_len += copy;
667                 skb->truesize += copy;
668                 sk->sk_wmem_queued += copy;
669                 sk->sk_forward_alloc -= copy;
670                 skb->ip_summed = CHECKSUM_HW;
671                 tp->write_seq += copy;
672                 TCP_SKB_CB(skb)->end_seq += copy;
673                 skb_shinfo(skb)->tso_segs = 0;
674
675                 if (!copied)
676                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
677
678                 copied += copy;
679                 poffset += copy;
680                 if (!(psize -= copy))
681                         goto out;
682
683                 if (skb->len < mss_now || (flags & MSG_OOB))
684                         continue;
685
686                 if (forced_push(tp)) {
687                         tcp_mark_push(tp, skb);
688                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
689                 } else if (skb == sk->sk_send_head)
690                         tcp_push_one(sk, mss_now);
691                 continue;
692
693 wait_for_sndbuf:
694                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
695 wait_for_memory:
696                 if (copied)
697                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
698
699                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
700                         goto do_error;
701
702                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
703                 size_goal = tp->xmit_size_goal;
704         }
705
706 out:
707         if (copied)
708                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
709         return copied;
710
711 do_error:
712         if (copied)
713                 goto out;
714 out_err:
715         return sk_stream_error(sk, flags, err);
716 }
717
718 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
719                      size_t size, int flags)
720 {
721         ssize_t res;
722         struct sock *sk = sock->sk;
723
724 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
725
726         if (!(sk->sk_route_caps & NETIF_F_SG) ||
727             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
728                 return sock_no_sendpage(sock, page, offset, size, flags);
729
730 #undef TCP_ZC_CSUM_FLAGS
731
732         lock_sock(sk);
733         TCP_CHECK_TIMER(sk);
734         res = do_tcp_sendpages(sk, &page, offset, size, flags);
735         TCP_CHECK_TIMER(sk);
736         release_sock(sk);
737         return res;
738 }
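/*
 * tcp_sendpage() is the path reached by sendfile(2) onto a TCP socket when
 * the route supports scatter-gather and checksum offload; otherwise it falls
 * back to sock_no_sendpage() above. User space sketch (placeholder names):
 *
 *      off_t off = 0;
 *      ssize_t n = sendfile(sock_fd, file_fd, &off, count);
 */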
739
740 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
741 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
742
743 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
744 {
745         int tmp = tp->mss_cache;
746
747         if (sk->sk_route_caps & NETIF_F_SG) {
748                 if (sk->sk_route_caps & NETIF_F_TSO)
749                         tmp = 0;
750                 else {
751                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
752
753                         if (tmp >= pgbreak &&
754                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
755                                 tmp = pgbreak;
756                 }
757         }
758
759         return tmp;
760 }
761
762 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
763                 size_t size)
764 {
765         struct iovec *iov;
766         struct tcp_sock *tp = tcp_sk(sk);
767         struct sk_buff *skb;
768         int iovlen, flags;
769         int mss_now, size_goal;
770         int err, copied;
771         long timeo;
772
773         lock_sock(sk);
774         TCP_CHECK_TIMER(sk);
775
776         flags = msg->msg_flags;
777         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
778
779         /* Wait for a connection to finish. */
780         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
781                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
782                         goto out_err;
783
784         /* This should be in poll */
785         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
786
787         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
788         size_goal = tp->xmit_size_goal;
789
790         /* Ok commence sending. */
791         iovlen = msg->msg_iovlen;
792         iov = msg->msg_iov;
793         copied = 0;
794
795         err = -EPIPE;
796         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
797                 goto do_error;
798
799         while (--iovlen >= 0) {
800                 int seglen = iov->iov_len;
801                 unsigned char __user *from = iov->iov_base;
802
803                 iov++;
804
805                 while (seglen > 0) {
806                         int copy;
807
808                         skb = sk->sk_write_queue.prev;
809
810                         if (!sk->sk_send_head ||
811                             (copy = size_goal - skb->len) <= 0) {
812
813 new_segment:
814                                 /* Allocate new segment. If the interface is SG,
815                                  * allocate skb fitting to single page.
816                                  */
817                                 if (!sk_stream_memory_free(sk))
818                                         goto wait_for_sndbuf;
819
820                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
821                                                            0, sk->sk_allocation);
822                                 if (!skb)
823                                         goto wait_for_memory;
824
825                                 /*
826                                  * Check whether we can use HW checksum.
827                                  */
828                                 if (sk->sk_route_caps &
829                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
830                                      NETIF_F_HW_CSUM))
831                                         skb->ip_summed = CHECKSUM_HW;
832
833                                 skb_entail(sk, tp, skb);
834                                 copy = size_goal;
835                         }
836
837                         /* Try to append data to the end of skb. */
838                         if (copy > seglen)
839                                 copy = seglen;
840
841                         /* Where to copy to? */
842                         if (skb_tailroom(skb) > 0) {
843                                 /* We have some space in skb head. Superb! */
844                                 if (copy > skb_tailroom(skb))
845                                         copy = skb_tailroom(skb);
846                                 if ((err = skb_add_data(skb, from, copy)) != 0)
847                                         goto do_fault;
848                         } else {
849                                 int merge = 0;
850                                 int i = skb_shinfo(skb)->nr_frags;
851                                 struct page *page = TCP_PAGE(sk);
852                                 int off = TCP_OFF(sk);
853
854                                 if (skb_can_coalesce(skb, i, page, off) &&
855                                     off != PAGE_SIZE) {
856                                         /* We can extend the last page
857                                          * fragment. */
858                                         merge = 1;
859                                 } else if (i == MAX_SKB_FRAGS ||
860                                            (!i &&
861                                            !(sk->sk_route_caps & NETIF_F_SG))) {
862                                         /* Need to add new fragment and cannot
863                                          * do this because interface is non-SG,
864                                          * or because all the page slots are
865                                          * busy. */
866                                         tcp_mark_push(tp, skb);
867                                         goto new_segment;
868                                 } else if (page) {
869                                         if (off == PAGE_SIZE) {
870                                                 put_page(page);
871                                                 TCP_PAGE(sk) = page = NULL;
872                                         }
873                                 }
874
875                                 if (!page) {
876                                         /* Allocate new cache page. */
877                                         if (!(page = sk_stream_alloc_page(sk)))
878                                                 goto wait_for_memory;
879                                         off = 0;
880                                 }
881
882                                 if (copy > PAGE_SIZE - off)
883                                         copy = PAGE_SIZE - off;
884
885                                 /* Time to copy data. We are close to
886                                  * the end! */
887                                 err = skb_copy_to_page(sk, from, skb, page,
888                                                        off, copy);
889                                 if (err) {
890                                         /* If this page was new, give it to the
891                                          * socket so it does not get leaked.
892                                          */
893                                         if (!TCP_PAGE(sk)) {
894                                                 TCP_PAGE(sk) = page;
895                                                 TCP_OFF(sk) = 0;
896                                         }
897                                         goto do_error;
898                                 }
899
900                                 /* Update the skb. */
901                                 if (merge) {
902                                         skb_shinfo(skb)->frags[i - 1].size +=
903                                                                         copy;
904                                 } else {
905                                         skb_fill_page_desc(skb, i, page, off, copy);
906                                         if (TCP_PAGE(sk)) {
907                                                 get_page(page);
908                                         } else if (off + copy < PAGE_SIZE) {
909                                                 get_page(page);
910                                                 TCP_PAGE(sk) = page;
911                                         }
912                                 }
913
914                                 TCP_OFF(sk) = off + copy;
915                         }
916
917                         if (!copied)
918                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
919
920                         tp->write_seq += copy;
921                         TCP_SKB_CB(skb)->end_seq += copy;
922                         skb_shinfo(skb)->tso_segs = 0;
923
924                         from += copy;
925                         copied += copy;
926                         if ((seglen -= copy) == 0 && iovlen == 0)
927                                 goto out;
928
929                         if (skb->len < mss_now || (flags & MSG_OOB))
930                                 continue;
931
932                         if (forced_push(tp)) {
933                                 tcp_mark_push(tp, skb);
934                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
935                         } else if (skb == sk->sk_send_head)
936                                 tcp_push_one(sk, mss_now);
937                         continue;
938
939 wait_for_sndbuf:
940                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
941 wait_for_memory:
942                         if (copied)
943                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
944
945                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
946                                 goto do_error;
947
948                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
949                         size_goal = tp->xmit_size_goal;
950                 }
951         }
952
953 out:
954         if (copied)
955                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
956         TCP_CHECK_TIMER(sk);
957         release_sock(sk);
958         return copied;
959
960 do_fault:
961         if (!skb->len) {
962                 if (sk->sk_send_head == skb)
963                         sk->sk_send_head = NULL;
964                 __skb_unlink(skb, &sk->sk_write_queue);
965                 sk_stream_free_skb(sk, skb);
966         }
967
968 do_error:
969         if (copied)
970                 goto out;
971 out_err:
972         err = sk_stream_error(sk, flags, err);
973         TCP_CHECK_TIMER(sk);
974         release_sock(sk);
975         return err;
976 }
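/*
 * The flags handled in tcp_sendmsg() map onto ordinary send(2) calls. User
 * space sketch (placeholder names):
 *
 *      send(fd, buf, len, 0);                  blocking stream write
 *      send(fd, buf, len, MSG_DONTWAIT);       fails with EAGAIN instead of
 *                                              waiting for send buffer space
 *      send(fd, "!", 1, MSG_OOB);              queues one byte of urgent data
 *                                              (see tcp_mark_urg() above)
 */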
977
978 /*
979  *      Handle reading urgent data. BSD has very simple semantics for
980  *      this, no blocking and very strange errors 8)
981  */
982
983 static int tcp_recv_urg(struct sock *sk, long timeo,
984                         struct msghdr *msg, int len, int flags,
985                         int *addr_len)
986 {
987         struct tcp_sock *tp = tcp_sk(sk);
988
989         /* No URG data to read. */
990         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
991             tp->urg_data == TCP_URG_READ)
992                 return -EINVAL; /* Yes this is right ! */
993
994         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
995                 return -ENOTCONN;
996
997         if (tp->urg_data & TCP_URG_VALID) {
998                 int err = 0;
999                 char c = tp->urg_data;
1000
1001                 if (!(flags & MSG_PEEK))
1002                         tp->urg_data = TCP_URG_READ;
1003
1004                 /* Read urgent data. */
1005                 msg->msg_flags |= MSG_OOB;
1006
1007                 if (len > 0) {
1008                         if (!(flags & MSG_TRUNC))
1009                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1010                         len = 1;
1011                 } else
1012                         msg->msg_flags |= MSG_TRUNC;
1013
1014                 return err ? -EFAULT : len;
1015         }
1016
1017         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1018                 return 0;
1019
1020         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1021          * the available implementations agree in this case:
1022          * this call should never block, independent of the
1023          * blocking state of the socket.
1024          * Mike <pall@rz.uni-karlsruhe.de>
1025          */
1026         return -EAGAIN;
1027 }
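/*
 * User space sketch for the urgent-data path above (placeholder names): with
 * SO_OOBINLINE left off, the single urgent byte is read out of band, and the
 * call fails with EAGAIN rather than blocking when the byte has not arrived:
 *
 *      char c;
 *      ssize_t n = recv(fd, &c, 1, MSG_OOB);
 */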
1028
1029 /* Clean up the receive buffer for full frames taken by the user,
1030  * then send an ACK if necessary.  COPIED is the number of bytes
1031  * tcp_recvmsg has given to the user so far, it speeds up the
1032  * calculation of whether or not we must ACK for the sake of
1033  * a window update.
1034  */
1035 static void cleanup_rbuf(struct sock *sk, int copied)
1036 {
1037         struct tcp_sock *tp = tcp_sk(sk);
1038         int time_to_ack = 0;
1039
1040 #if TCP_DEBUG
1041         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1042
1043         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1044 #endif
1045
1046         if (inet_csk_ack_scheduled(sk)) {
1047                 const struct inet_connection_sock *icsk = inet_csk(sk);
1048                    /* Delayed ACKs frequently hit locked sockets during bulk
1049                     * receive. */
1050                 if (icsk->icsk_ack.blocked ||
1051                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1052                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1053                     /*
1054                      * If this read emptied the read buffer, we send an ACK if
1055                      * the connection is not bidirectional, the user has drained
1056                      * the receive buffer and there was a small segment
1057                      * in the queue.
1058                      */
1059                     (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1060                      !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1061                         time_to_ack = 1;
1062         }
1063
1064         /* We send an ACK if we can now advertise a non-zero window
1065          * which has been raised "significantly".
1066          *
1067          * Even if the window is raised up to infinity, do not send a window-open
1068          * ACK in states where we will not receive more. It is useless.
1069          */
1070         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1071                 __u32 rcv_window_now = tcp_receive_window(tp);
1072
1073                 /* Optimize, __tcp_select_window() is not cheap. */
1074                 if (2*rcv_window_now <= tp->window_clamp) {
1075                         __u32 new_window = __tcp_select_window(sk);
1076
1077                         /* Send an ACK now if this read freed lots of space
1078                          * in our buffer. We can advertise the new window now,
1079                          * if it is not less than the current one.
1080                          * "Lots" means "at least twice" here.
1081                          */
1082                         if (new_window && new_window >= 2 * rcv_window_now)
1083                                 time_to_ack = 1;
1084                 }
1085         }
1086         if (time_to_ack)
1087                 tcp_send_ack(sk);
1088 }
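/*
 * Worked example for the window test above: with tp->window_clamp at 64K and
 * the advertised window shrunk to 16K, the cheap test passes (2 * 16K <= 64K)
 * and __tcp_select_window() is consulted; if the read freed enough space that
 * it now returns 40K, then 40K >= 2 * 16K holds and the ACK goes out
 * immediately instead of waiting for the delayed ACK timer.
 */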
1089
1090 static void tcp_prequeue_process(struct sock *sk)
1091 {
1092         struct sk_buff *skb;
1093         struct tcp_sock *tp = tcp_sk(sk);
1094
1095         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1096
1097         /* The RX process wants to run with BHs disabled, though it is
1098          * not necessary. */
1099         local_bh_disable();
1100         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1101                 sk->sk_backlog_rcv(sk, skb);
1102         local_bh_enable();
1103
1104         /* Clear memory counter. */
1105         tp->ucopy.memory = 0;
1106 }
1107
1108 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1109 {
1110         struct sk_buff *skb;
1111         u32 offset;
1112
1113         skb_queue_walk(&sk->sk_receive_queue, skb) {
1114                 offset = seq - TCP_SKB_CB(skb)->seq;
1115                 if (skb->h.th->syn)
1116                         offset--;
1117                 if (offset < skb->len || skb->h.th->fin) {
1118                         *off = offset;
1119                         return skb;
1120                 }
1121         }
1122         return NULL;
1123 }
1124
1125 /*
1126  * This routine provides an alternative to tcp_recvmsg() for routines
1127  * that would like to handle copying from skbuffs directly in 'sendfile'
1128  * fashion.
1129  * Note:
1130  *      - It is assumed that the socket was locked by the caller.
1131  *      - The routine does not block.
1132  *      - At present, there is no support for reading OOB data
1133  *        or for 'peeking' the socket using this routine
1134  *        (although both would be easy to implement).
1135  */
1136 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1137                   sk_read_actor_t recv_actor)
1138 {
1139         struct sk_buff *skb;
1140         struct tcp_sock *tp = tcp_sk(sk);
1141         u32 seq = tp->copied_seq;
1142         u32 offset;
1143         int copied = 0;
1144
1145         if (sk->sk_state == TCP_LISTEN)
1146                 return -ENOTCONN;
1147         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1148                 if (offset < skb->len) {
1149                         size_t used, len;
1150
1151                         len = skb->len - offset;
1152                         /* Stop reading if we hit a patch of urgent data */
1153                         if (tp->urg_data) {
1154                                 u32 urg_offset = tp->urg_seq - seq;
1155                                 if (urg_offset < len)
1156                                         len = urg_offset;
1157                                 if (!len)
1158                                         break;
1159                         }
1160                         used = recv_actor(desc, skb, offset, len);
1161                         if (used <= len) {
1162                                 seq += used;
1163                                 copied += used;
1164                                 offset += used;
1165                         }
1166                         if (offset != skb->len)
1167                                 break;
1168                 }
1169                 if (skb->h.th->fin) {
1170                         sk_eat_skb(sk, skb);
1171                         ++seq;
1172                         break;
1173                 }
1174                 sk_eat_skb(sk, skb);
1175                 if (!desc->count)
1176                         break;
1177         }
1178         tp->copied_seq = seq;
1179
1180         tcp_rcv_space_adjust(sk);
1181
1182         /* Clean up data we have read: This will do ACK frames. */
1183         if (copied)
1184                 cleanup_rbuf(sk, copied);
1185         return copied;
1186 }
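/*
 * Sketch of a tcp_read_sock() caller ("example_actor" is an illustrative
 * name; only the sk_read_actor_t signature is taken from the headers):
 *
 *      static int example_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *                               unsigned int offset, size_t len)
 *      {
 *              size_t used = min_t(size_t, len, desc->count);
 *
 *              desc->count -= used;    (the data at 'offset' is consumed here)
 *              return used;
 *      }
 *
 * The socket must be locked by the caller, and the walk stops when the actor
 * consumes less than it was offered or desc->count reaches zero.
 */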
1187
1188 /*
1189  *      This routine copies from a sock struct into the user buffer.
1190  *
1191  *      Technical note: in 2.3 we work on _locked_ socket, so that
1192  *      tricks with *seq access order and skb->users are not required.
1193  *      The code can probably be improved even further.
1194  */
1195
1196 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1197                 size_t len, int nonblock, int flags, int *addr_len)
1198 {
1199         struct tcp_sock *tp = tcp_sk(sk);
1200         int copied = 0;
1201         u32 peek_seq;
1202         u32 *seq;
1203         unsigned long used;
1204         int err;
1205         int target;             /* Read at least this many bytes */
1206         long timeo;
1207         struct task_struct *user_recv = NULL;
1208
1209         lock_sock(sk);
1210
1211         TCP_CHECK_TIMER(sk);
1212
1213         err = -ENOTCONN;
1214         if (sk->sk_state == TCP_LISTEN)
1215                 goto out;
1216
1217         timeo = sock_rcvtimeo(sk, nonblock);
1218
1219         /* Urgent data needs to be handled specially. */
1220         if (flags & MSG_OOB)
1221                 goto recv_urg;
1222
1223         seq = &tp->copied_seq;
1224         if (flags & MSG_PEEK) {
1225                 peek_seq = tp->copied_seq;
1226                 seq = &peek_seq;
1227         }
1228
1229         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1230
1231         do {
1232                 struct sk_buff *skb;
1233                 u32 offset;
1234
1235                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1236                 if (tp->urg_data && tp->urg_seq == *seq) {
1237                         if (copied)
1238                                 break;
1239                         if (signal_pending(current)) {
1240                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1241                                 break;
1242                         }
1243                 }
1244
1245                 /* Next get a buffer. */
1246
1247                 skb = skb_peek(&sk->sk_receive_queue);
1248                 do {
1249                         if (!skb)
1250                                 break;
1251
1252                         /* Now that we have two receive queues this
1253                          * shouldn't happen.
1254                          */
1255                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1256                                 printk(KERN_INFO "recvmsg bug: copied %X "
1257                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1258                                 break;
1259                         }
1260                         offset = *seq - TCP_SKB_CB(skb)->seq;
1261                         if (skb->h.th->syn)
1262                                 offset--;
1263                         if (offset < skb->len)
1264                                 goto found_ok_skb;
1265                         if (skb->h.th->fin)
1266                                 goto found_fin_ok;
1267                         BUG_TRAP(flags & MSG_PEEK);
1268                         skb = skb->next;
1269                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1270
1271                 /* Well, if we have backlog, try to process it now. */
1272
1273                 if (copied >= target && !sk->sk_backlog.tail)
1274                         break;
1275
1276                 if (copied) {
1277                         if (sk->sk_err ||
1278                             sk->sk_state == TCP_CLOSE ||
1279                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1280                             !timeo ||
1281                             signal_pending(current) ||
1282                             (flags & MSG_PEEK))
1283                                 break;
1284                 } else {
1285                         if (sock_flag(sk, SOCK_DONE))
1286                                 break;
1287
1288                         if (sk->sk_err) {
1289                                 copied = sock_error(sk);
1290                                 break;
1291                         }
1292
1293                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1294                                 break;
1295
1296                         if (sk->sk_state == TCP_CLOSE) {
1297                                 if (!sock_flag(sk, SOCK_DONE)) {
1298                                         /* This occurs when the user tries to read
1299                                          * from a socket that was never connected.
1300                                          */
1301                                         copied = -ENOTCONN;
1302                                         break;
1303                                 }
1304                                 break;
1305                         }
1306
1307                         if (!timeo) {
1308                                 copied = -EAGAIN;
1309                                 break;
1310                         }
1311
1312                         if (signal_pending(current)) {
1313                                 copied = sock_intr_errno(timeo);
1314                                 break;
1315                         }
1316                 }
1317
1318                 cleanup_rbuf(sk, copied);
1319
1320                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1321                         /* Install new reader */
1322                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1323                                 user_recv = current;
1324                                 tp->ucopy.task = user_recv;
1325                                 tp->ucopy.iov = msg->msg_iov;
1326                         }
1327
1328                         tp->ucopy.len = len;
1329
1330                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1331                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1332
1333                         /* Ugly... If the prequeue is not empty, we have to
1334                          * process it before releasing the socket; otherwise
1335                          * ordering will be broken on the second iteration.
1336                          * A more elegant solution is required!!!
1337                          *
1338                          * Look: we have the following (pseudo)queues:
1339                          *
1340                          * 1. packets in flight
1341                          * 2. backlog
1342                          * 3. prequeue
1343                          * 4. receive_queue
1344                          *
1345                          * Each queue can be processed only if the next ones
1346                          * are empty. At this point we have empty receive_queue.
1347                          * But prequeue _can_ be not empty after 2nd iteration,
1348                          * when we jumped to start of loop because backlog
1349                          * processing added something to receive_queue.
1350                          * We cannot release_sock(), because backlog contains
1351                          * packets arrived _after_ prequeued ones.
1352                          *
1353                          * In short, the algorithm is clear: process all
1354                          * the queues in order. We could do it more directly,
1355                          * requeueing packets from the backlog to the prequeue
1356                          * when it is not empty. That is more elegant, but eats
1357                          * cycles, unfortunately.
1358                          */
1359                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1360                                 goto do_prequeue;
1361
1362                         /* __ Set realtime policy in scheduler __ */
1363                 }
1364
1365                 if (copied >= target) {
1366                         /* Do not sleep, just process backlog. */
1367                         release_sock(sk);
1368                         lock_sock(sk);
1369                 } else
1370                         sk_wait_data(sk, &timeo);
1371
1372                 if (user_recv) {
1373                         int chunk;
1374
1375                         /* __ Restore normal policy in scheduler __ */
1376
1377                         if ((chunk = len - tp->ucopy.len) != 0) {
1378                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1379                                 len -= chunk;
1380                                 copied += chunk;
1381                         }
1382
1383                         if (tp->rcv_nxt == tp->copied_seq &&
1384                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1385 do_prequeue:
1386                                 tcp_prequeue_process(sk);
1387
1388                                 if ((chunk = len - tp->ucopy.len) != 0) {
1389                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1390                                         len -= chunk;
1391                                         copied += chunk;
1392                                 }
1393                         }
1394                 }
1395                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1396                         if (net_ratelimit())
1397                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1398                                        current->comm, current->pid);
1399                         peek_seq = tp->copied_seq;
1400                 }
1401                 continue;
1402
1403         found_ok_skb:
1404                 /* Ok so how much can we use? */
1405                 used = skb->len - offset;
1406                 if (len < used)
1407                         used = len;
1408
1409                 /* Do we have urgent data here? */
1410                 if (tp->urg_data) {
1411                         u32 urg_offset = tp->urg_seq - *seq;
1412                         if (urg_offset < used) {
1413                                 if (!urg_offset) {
1414                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1415                                                 ++*seq;
1416                                                 offset++;
1417                                                 used--;
1418                                                 if (!used)
1419                                                         goto skip_copy;
1420                                         }
1421                                 } else
1422                                         used = urg_offset;
1423                         }
1424                 }
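                /* Note: when SO_OOBINLINE is not set (SOCK_URGINLINE clear),
                 * the urgent byte is skipped above and is only available via
                 * recv(..., MSG_OOB), i.e. the tcp_recv_urg() path.
                 */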
1425
1426                 if (!(flags & MSG_TRUNC)) {
1427                         err = skb_copy_datagram_iovec(skb, offset,
1428                                                       msg->msg_iov, used);
1429                         if (err) {
1430                                 /* Exception. Bailout! */
1431                                 if (!copied)
1432                                         copied = -EFAULT;
1433                                 break;
1434                         }
1435                 }
1436
1437                 *seq += used;
1438                 copied += used;
1439                 len -= used;
1440
1441                 tcp_rcv_space_adjust(sk);
1442
1443 skip_copy:
1444                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1445                         tp->urg_data = 0;
1446                         tcp_fast_path_check(sk, tp);
1447                 }
1448                 if (used + offset < skb->len)
1449                         continue;
1450
1451                 if (skb->h.th->fin)
1452                         goto found_fin_ok;
1453                 if (!(flags & MSG_PEEK))
1454                         sk_eat_skb(sk, skb);
1455                 continue;
1456
1457         found_fin_ok:
1458                 /* Process the FIN. */
1459                 ++*seq;
1460                 if (!(flags & MSG_PEEK))
1461                         sk_eat_skb(sk, skb);
1462                 break;
1463         } while (len > 0);
1464
1465         if (user_recv) {
1466                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1467                         int chunk;
1468
1469                         tp->ucopy.len = copied > 0 ? len : 0;
1470
1471                         tcp_prequeue_process(sk);
1472
1473                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1474                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1475                                 len -= chunk;
1476                                 copied += chunk;
1477                         }
1478                 }
1479
1480                 tp->ucopy.task = NULL;
1481                 tp->ucopy.len = 0;
1482         }
1483
1484         /* According to UNIX98, msg_name/msg_namelen are ignored
1485          * on a connected socket. I was just happy when I found this 8) --ANK
1486          */
1487
1488         /* Clean up data we have read: This will do ACK frames. */
1489         cleanup_rbuf(sk, copied);
1490
1491         TCP_CHECK_TIMER(sk);
1492         release_sock(sk);
1493         return copied;
1494
1495 out:
1496         TCP_CHECK_TIMER(sk);
1497         release_sock(sk);
1498         return err;
1499
1500 recv_urg:
1501         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1502         goto out;
1503 }
1504
1505 /*
1506  *      State processing on a close. This implements the state shift for
1507  *      sending our FIN frame. Note that we only send a FIN for some
1508  *      states. A shutdown() may have already sent the FIN, or we may be
1509  *      closed.
1510  */
1511
1512 static unsigned char new_state[16] = {
1513   /* current state:        new state:      action:      */
1514   /* (Invalid)          */ TCP_CLOSE,
1515   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1516   /* TCP_SYN_SENT       */ TCP_CLOSE,
1517   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1518   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1519   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1520   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1521   /* TCP_CLOSE          */ TCP_CLOSE,
1522   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1523   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1524   /* TCP_LISTEN         */ TCP_CLOSE,
1525   /* TCP_CLOSING        */ TCP_CLOSING,
1526 };
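
/*
 * Reading the table: each entry packs the next state in the low bits
 * (TCP_STATE_MASK) plus an optional TCP_ACTION_FIN flag.  For example, the
 * TCP_ESTABLISHED entry is TCP_FIN_WAIT1 | TCP_ACTION_FIN: on close we move
 * to FIN-WAIT-1 and tcp_close_state() below tells the caller to send a FIN.
 */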
1527
1528 static int tcp_close_state(struct sock *sk)
1529 {
1530         int next = (int)new_state[sk->sk_state];
1531         int ns = next & TCP_STATE_MASK;
1532
1533         tcp_set_state(sk, ns);
1534
1535         return next & TCP_ACTION_FIN;
1536 }
1537
1538 /*
1539  *      Shutdown the sending side of a connection. Much like close except
1540  *      that we don't shut down the receive side or mark the socket SOCK_DEAD.
1541  */
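/*
 *	Illustrative note: a userspace half-close such as
 *
 *		shutdown(fd, SHUT_WR);
 *
 *	arrives here with SEND_SHUTDOWN set in 'how' (the address-family
 *	code remaps the SHUT_* values), so only the send direction is
 *	affected by this function.
 */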
1542
1543 void tcp_shutdown(struct sock *sk, int how)
1544 {
1545         /*      We need to grab some memory, and put together a FIN,
1546          *      and then put it into the queue to be sent.
1547          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1548          */
1549         if (!(how & SEND_SHUTDOWN))
1550                 return;
1551
1552         /* If we've already sent a FIN, or it's a closed state, skip this. */
1553         if ((1 << sk->sk_state) &
1554             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1555              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1556                 /* Clear out any half completed packets.  FIN if needed. */
1557                 if (tcp_close_state(sk))
1558                         tcp_send_fin(sk);
1559         }
1560 }
1561
1562 /*
1563  * At this point, there should be no process reference to this
1564  * socket, and thus no user references at all.  Therefore we
1565  * can assume the socket waitqueue is inactive and nobody will
1566  * try to jump onto it.
1567  */
1568 void inet_csk_destroy_sock(struct sock *sk)
1569 {
1570         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1571         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1572
1573         /* It cannot be in hash table! */
1574         BUG_TRAP(sk_unhashed(sk));
1575
1576         /* If inet_sk(sk)->num is non-zero, it must be bound */
1577         BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
1578
1579         sk->sk_prot->destroy(sk);
1580
1581         sk_stream_kill_queues(sk);
1582
1583         xfrm_sk_free_policy(sk);
1584
1585         sk_refcnt_debug_release(sk);
1586
1587         atomic_dec(sk->sk_prot->orphan_count);
1588         sock_put(sk);
1589 }
1590
1591 void tcp_close(struct sock *sk, long timeout)
1592 {
1593         struct sk_buff *skb;
1594         int data_was_unread = 0;
1595
1596         lock_sock(sk);
1597         sk->sk_shutdown = SHUTDOWN_MASK;
1598
1599         if (sk->sk_state == TCP_LISTEN) {
1600                 tcp_set_state(sk, TCP_CLOSE);
1601
1602                 /* Special case. */
1603                 inet_csk_listen_stop(sk);
1604
1605                 goto adjudge_to_death;
1606         }
1607
1608         /*  We need to flush the recv. buffs.  We do this only on the
1609          *  descriptor close, not protocol-sourced closes, because the
1610          *  reader process may not have drained the data yet!
1611          */
1612         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1613                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1614                           skb->h.th->fin;
1615                 data_was_unread += len;
1616                 __kfree_skb(skb);
1617         }
1618
1619         sk_stream_mem_reclaim(sk);
1620
1621         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1622          * 3.10, we send a RST here because data was lost.  To
1623          * witness the awful effects of the old behavior of always
1624          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1625          * a bulk GET in an FTP client, suspend the process, wait
1626          * for the client to advertise a zero window, then kill -9
1627          * the FTP client, wheee...  Note: timeout is always zero
1628          * in such a case.
1629          */
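        /* Related userspace knob (illustrative sketch): a zero-linger close,
         * e.g.
         *
         *	struct linger l = { .l_onoff = 1, .l_linger = 0 };
         *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
         *	close(fd);
         *
         * takes the SOCK_LINGER branch below, which disconnects the socket
         * and accounts the abort in LINUX_MIB_TCPABORTONDATA.
         */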
1630         if (data_was_unread) {
1631                 /* Unread data was tossed, zap the connection. */
1632                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1633                 tcp_set_state(sk, TCP_CLOSE);
1634                 tcp_send_active_reset(sk, GFP_KERNEL);
1635         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1636                 /* Check zero linger _after_ checking for unread data. */
1637                 sk->sk_prot->disconnect(sk, 0);
1638                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1639         } else if (tcp_close_state(sk)) {
1640                 /* We FIN if the application ate all the data before
1641                  * zapping the connection.
1642                  */
1643
1644                 /* RED-PEN. Formally speaking, we have broken the TCP state
1645                  * machine. State transitions:
1646                  *
1647                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1648                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1649                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1650                  *
1651                  * are legal only when FIN has been sent (i.e. in window),
1652                  * rather than queued out of window. Purists may blame us.
1653                  *
1654                  * E.g. the "RFC state" is ESTABLISHED,
1655                  * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
1656                  *
1657                  * The visible deviations are that we sometimes
1658                  * enter the time-wait state when it is not really required
1659                  * (harmless), and do not send active resets when they are
1660                  * required by the specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when
1661                  * they look like CLOSING or LAST_ACK to Linux).
1662                  * I have probably missed some more small holes.
1663                  *                                              --ANK
1664                  */
1665                 tcp_send_fin(sk);
1666         }
1667
1668         sk_stream_wait_close(sk, timeout);
1669
1670 adjudge_to_death:
1671         /* It is the last release_sock in its life. It will remove backlog. */
1672         release_sock(sk);
1673
1674
1675         /* Now socket is owned by kernel and we acquire BH lock
1676            to finish close. No need to check for user refs.
1677          */
1678         local_bh_disable();
1679         bh_lock_sock(sk);
1680         BUG_TRAP(!sock_owned_by_user(sk));
1681
1682         sock_hold(sk);
1683         sock_orphan(sk);
1684
1685         /*      This is a (useful) BSD violation of the RFC. There is a
1686          *      problem with TCP as specified, in that the other end could
1687          *      keep a socket open forever with no application left at this end.
1688          *      We use a 3 minute timeout (about the same as BSD) and then kill
1689          *      our end. If they send after that then tough - BUT: the timeout is
1690          *      long enough that we won't repeat the old "4*rto = almost no
1691          *      time - whoops, reset" mistake.
1692          *
1693          *      Nope, it was not a mistake. It is really the desired behaviour,
1694          *      e.g. on HTTP servers, where such sockets are useless but
1695          *      consume significant resources. Let's do it with the special
1696          *      linger2 option.                                 --ANK
1697          */
1698
1699         if (sk->sk_state == TCP_FIN_WAIT2) {
1700                 struct tcp_sock *tp = tcp_sk(sk);
1701                 if (tp->linger2 < 0) {
1702                         tcp_set_state(sk, TCP_CLOSE);
1703                         tcp_send_active_reset(sk, GFP_ATOMIC);
1704                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1705                 } else {
1706                         const int tmo = tcp_fin_time(sk);
1707
1708                         if (tmo > TCP_TIMEWAIT_LEN) {
1709                                 inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1710                         } else {
1711                                 atomic_inc(sk->sk_prot->orphan_count);
1712                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1713                                 goto out;
1714                         }
1715                 }
1716         }
1717         if (sk->sk_state != TCP_CLOSE) {
1718                 sk_stream_mem_reclaim(sk);
1719                 if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1720                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1721                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1722                         if (net_ratelimit())
1723                                 printk(KERN_INFO "TCP: too many orphaned "
1724                                        "sockets\n");
1725                         tcp_set_state(sk, TCP_CLOSE);
1726                         tcp_send_active_reset(sk, GFP_ATOMIC);
1727                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1728                 }
1729         }
1730         atomic_inc(sk->sk_prot->orphan_count);
1731
1732         if (sk->sk_state == TCP_CLOSE)
1733                 inet_csk_destroy_sock(sk);
1734         /* Otherwise, socket is reprieved until protocol close. */
1735
1736 out:
1737         bh_unlock_sock(sk);
1738         local_bh_enable();
1739         sock_put(sk);
1740 }
1741
1742 /* These states need RST on ABORT according to RFC793 */
1743
1744 static inline int tcp_need_reset(int state)
1745 {
1746         return (1 << state) &
1747                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1748                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1749 }
1750
1751 int tcp_disconnect(struct sock *sk, int flags)
1752 {
1753         struct inet_sock *inet = inet_sk(sk);
1754         struct inet_connection_sock *icsk = inet_csk(sk);
1755         struct tcp_sock *tp = tcp_sk(sk);
1756         int err = 0;
1757         int old_state = sk->sk_state;
1758
1759         if (old_state != TCP_CLOSE)
1760                 tcp_set_state(sk, TCP_CLOSE);
1761
1762         /* ABORT function of RFC793 */
1763         if (old_state == TCP_LISTEN) {
1764                 inet_csk_listen_stop(sk);
1765         } else if (tcp_need_reset(old_state) ||
1766                    (tp->snd_nxt != tp->write_seq &&
1767                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1768                 /* The last check adjusts for the discrepancy between Linux
1769                  * and the RFC states.
1770                  */
1771                 tcp_send_active_reset(sk, gfp_any());
1772                 sk->sk_err = ECONNRESET;
1773         } else if (old_state == TCP_SYN_SENT)
1774                 sk->sk_err = ECONNRESET;
1775
1776         tcp_clear_xmit_timers(sk);
1777         __skb_queue_purge(&sk->sk_receive_queue);
1778         sk_stream_writequeue_purge(sk);
1779         __skb_queue_purge(&tp->out_of_order_queue);
1780
1781         inet->dport = 0;
1782
1783         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1784                 inet_reset_saddr(sk);
1785
1786         sk->sk_shutdown = 0;
1787         sock_reset_flag(sk, SOCK_DONE);
1788         tp->srtt = 0;
1789         if ((tp->write_seq += tp->max_window + 2) == 0)
1790                 tp->write_seq = 1;
1791         icsk->icsk_backoff = 0;
1792         tp->snd_cwnd = 2;
1793         tp->probes_out = 0;
1794         tp->packets_out = 0;
1795         tp->snd_ssthresh = 0x7fffffff;
1796         tp->snd_cwnd_cnt = 0;
1797         tcp_set_ca_state(tp, TCP_CA_Open);
1798         tcp_clear_retrans(tp);
1799         inet_csk_delack_init(sk);
1800         sk->sk_send_head = NULL;
1801         tp->rx_opt.saw_tstamp = 0;
1802         tcp_sack_reset(&tp->rx_opt);
1803         __sk_dst_reset(sk);
1804
1805         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1806
1807         sk->sk_error_report(sk);
1808         return err;
1809 }
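
/*
 * Note: besides internal callers, userspace can reach tcp_disconnect() by
 * calling connect() with an address whose sa_family is AF_UNSPEC;
 * inet_stream_connect() then invokes sk->sk_prot->disconnect().
 */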
1810
1811 /*
1812  *      Socket option code for TCP.
1813  */
1814 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1815                    int optlen)
1816 {
1817         struct tcp_sock *tp = tcp_sk(sk);
1818         struct inet_connection_sock *icsk = inet_csk(sk);
1819         int val;
1820         int err = 0;
1821
1822         if (level != SOL_TCP)
1823                 return tp->af_specific->setsockopt(sk, level, optname,
1824                                                    optval, optlen);
1825
1826         /* This is a string value; all the others are ints. */
1827         if (optname == TCP_CONGESTION) {
1828                 char name[TCP_CA_NAME_MAX];
1829
1830                 if (optlen < 1)
1831                         return -EINVAL;
1832
1833                 val = strncpy_from_user(name, optval,
1834                                         min(TCP_CA_NAME_MAX-1, optlen));
1835                 if (val < 0)
1836                         return -EFAULT;
1837                 name[val] = 0;
1838
1839                 lock_sock(sk);
1840                 err = tcp_set_congestion_control(tp, name);
1841                 release_sock(sk);
1842                 return err;
1843         }
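
        /* Illustrative userspace sketch for the string option handled above
         * ("reno" is just an example of a registered algorithm):
         *
         *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
         *		   "reno", strlen("reno"));
         */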
1844
1845         if (optlen < sizeof(int))
1846                 return -EINVAL;
1847
1848         if (get_user(val, (int __user *)optval))
1849                 return -EFAULT;
1850
1851         lock_sock(sk);
1852
1853         switch (optname) {
1854         case TCP_MAXSEG:
1855                 /* Values greater than the interface MTU won't take effect.
1856                  * However, at the point when this call is made we typically
1857                  * don't yet know which interface is going to be used. */
1858                 if (val < 8 || val > MAX_TCP_WINDOW) {
1859                         err = -EINVAL;
1860                         break;
1861                 }
1862                 tp->rx_opt.user_mss = val;
1863                 break;
1864
1865         case TCP_NODELAY:
1866                 if (val) {
1867                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1868                          * this option on a corked socket is remembered, but
1869                          * it is not activated until cork is cleared.
1870                          *
1871                          * However, when TCP_NODELAY is set we make
1872                          * an explicit push, which overrides even TCP_CORK
1873                          * for currently queued segments.
1874                          */
1875                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1876                         tcp_push_pending_frames(sk, tp);
1877                 } else {
1878                         tp->nonagle &= ~TCP_NAGLE_OFF;
1879                 }
1880                 break;
1881
1882         case TCP_CORK:
1883                 /* When set, always queue non-full frames.
1884                  * Later the user clears this option and we transmit
1885                  * any pending partial frames in the queue.  This is
1886                  * meant to be used alongside sendfile() to get properly
1887                  * filled frames when the user (for example) must write
1888                  * out headers with a write() call first and then use
1889                  * sendfile to send out the data parts.
1890                  *
1891                  * TCP_CORK can be set together with TCP_NODELAY and it is
1892                  * stronger than TCP_NODELAY.
1893                  */
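                /* Illustrative userspace sketch of the pattern described
                 * above (names and error handling are assumed/omitted):
                 *
                 *	int on = 1, off = 0;
                 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
                 *	write(fd, hdr, hdrlen);
                 *	sendfile(fd, filefd, NULL, filelen);
                 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
                 */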
1894                 if (val) {
1895                         tp->nonagle |= TCP_NAGLE_CORK;
1896                 } else {
1897                         tp->nonagle &= ~TCP_NAGLE_CORK;
1898                         if (tp->nonagle&TCP_NAGLE_OFF)
1899                                 tp->nonagle |= TCP_NAGLE_PUSH;
1900                         tcp_push_pending_frames(sk, tp);
1901                 }
1902                 break;
1903
1904         case TCP_KEEPIDLE:
1905                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1906                         err = -EINVAL;
1907                 else {
1908                         tp->keepalive_time = val * HZ;
1909                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
1910                             !((1 << sk->sk_state) &
1911                               (TCPF_CLOSE | TCPF_LISTEN))) {
1912                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1913                                 if (tp->keepalive_time > elapsed)
1914                                         elapsed = tp->keepalive_time - elapsed;
1915                                 else
1916                                         elapsed = 0;
1917                                 inet_csk_reset_keepalive_timer(sk, elapsed);
1918                         }
1919                 }
1920                 break;
1921         case TCP_KEEPINTVL:
1922                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
1923                         err = -EINVAL;
1924                 else
1925                         tp->keepalive_intvl = val * HZ;
1926                 break;
1927         case TCP_KEEPCNT:
1928                 if (val < 1 || val > MAX_TCP_KEEPCNT)
1929                         err = -EINVAL;
1930                 else
1931                         tp->keepalive_probes = val;
1932                 break;
1933         case TCP_SYNCNT:
1934                 if (val < 1 || val > MAX_TCP_SYNCNT)
1935                         err = -EINVAL;
1936                 else
1937                         icsk->icsk_syn_retries = val;
1938                 break;
1939
1940         case TCP_LINGER2:
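                /* Rough semantics, as later consumed by tcp_fin_time() and
                 * tcp_close(): a negative value disables FIN-WAIT-2 lingering
                 * (the socket is reset instead), zero or anything above
                 * sysctl_tcp_fin_timeout falls back to that sysctl default,
                 * and other values give the FIN-WAIT-2 lifetime in seconds.
                 */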
1941                 if (val < 0)
1942                         tp->linger2 = -1;
1943                 else if (val > sysctl_tcp_fin_timeout / HZ)
1944                         tp->linger2 = 0;
1945                 else
1946                         tp->linger2 = val * HZ;
1947                 break;
1948
1949         case TCP_DEFER_ACCEPT:
1950                 tp->defer_accept = 0;
1951                 if (val > 0) {
1952                         /* Translate value in seconds to number of
1953                          * retransmits */
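                        /* Worked example (assuming the usual TCP_TIMEOUT_INIT
                         * of 3*HZ): val = 10 seconds gives thresholds 3, 6, 12;
                         * the loop stops with defer_accept == 2 and the final
                         * increment stores 3, i.e. data is waited for across
                         * roughly three SYN-ACK retransmissions.
                         */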
1954                         while (tp->defer_accept < 32 &&
1955                                val > ((TCP_TIMEOUT_INIT / HZ) <<
1956                                        tp->defer_accept))
1957                                 tp->defer_accept++;
1958                         tp->defer_accept++;
1959                 }
1960                 break;
1961
1962         case TCP_WINDOW_CLAMP:
1963                 if (!val) {
1964                         if (sk->sk_state != TCP_CLOSE) {
1965                                 err = -EINVAL;
1966                                 break;
1967                         }
1968                         tp->window_clamp = 0;
1969                 } else
1970                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1971                                                 SOCK_MIN_RCVBUF / 2 : val;
1972                 break;
1973
1974         case TCP_QUICKACK:
1975                 if (!val) {
1976                         icsk->icsk_ack.pingpong = 1;
1977                 } else {
1978                         icsk->icsk_ack.pingpong = 0;
1979                         if ((1 << sk->sk_state) &
1980                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1981                             inet_csk_ack_scheduled(sk)) {
1982                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1983                                 cleanup_rbuf(sk, 1);
1984                                 if (!(val & 1))
1985                                         icsk->icsk_ack.pingpong = 1;
1986                         }
1987                 }
1988                 break;
1989
1990         default:
1991                 err = -ENOPROTOOPT;
1992                 break;
1993         }
1994         release_sock(sk);
1995         return err;
1996 }
1997
1998 /* Return information about the state of the TCP endpoint in API format. */
1999 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2000 {
2001         struct tcp_sock *tp = tcp_sk(sk);
2002         const struct inet_connection_sock *icsk = inet_csk(sk);
2003         u32 now = tcp_time_stamp;
2004
2005         memset(info, 0, sizeof(*info));
2006
2007         info->tcpi_state = sk->sk_state;
2008         info->tcpi_ca_state = tp->ca_state;
2009         info->tcpi_retransmits = icsk->icsk_retransmits;
2010         info->tcpi_probes = tp->probes_out;
2011         info->tcpi_backoff = icsk->icsk_backoff;
2012
2013         if (tp->rx_opt.tstamp_ok)
2014                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2015         if (tp->rx_opt.sack_ok)
2016                 info->tcpi_options |= TCPI_OPT_SACK;
2017         if (tp->rx_opt.wscale_ok) {
2018                 info->tcpi_options |= TCPI_OPT_WSCALE;
2019                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2020                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2021                 }
2022
2023         if (tp->ecn_flags&TCP_ECN_OK)
2024                 info->tcpi_options |= TCPI_OPT_ECN;
2025
2026         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2027         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2028         info->tcpi_snd_mss = tp->mss_cache;
2029         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2030
2031         info->tcpi_unacked = tp->packets_out;
2032         info->tcpi_sacked = tp->sacked_out;
2033         info->tcpi_lost = tp->lost_out;
2034         info->tcpi_retrans = tp->retrans_out;
2035         info->tcpi_fackets = tp->fackets_out;
2036
2037         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2038         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2039         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2040
2041         info->tcpi_pmtu = tp->pmtu_cookie;
2042         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2043         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2044         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2045         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2046         info->tcpi_snd_cwnd = tp->snd_cwnd;
2047         info->tcpi_advmss = tp->advmss;
2048         info->tcpi_reordering = tp->reordering;
2049
2050         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2051         info->tcpi_rcv_space = tp->rcvq_space.space;
2052
2053         info->tcpi_total_retrans = tp->total_retrans;
2054 }
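
/*
 * Illustrative userspace sketch: the same data is exported through
 * getsockopt(TCP_INFO), handled in tcp_getsockopt() below.
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt %u us, cwnd %u\n", info.tcpi_rtt, info.tcpi_snd_cwnd);
 */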
2055
2056 EXPORT_SYMBOL_GPL(tcp_get_info);
2057
2058 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2059                    int __user *optlen)
2060 {
2061         struct tcp_sock *tp = tcp_sk(sk);
2062         int val, len;
2063
2064         if (level != SOL_TCP)
2065                 return tp->af_specific->getsockopt(sk, level, optname,
2066                                                    optval, optlen);
2067
2068         if (get_user(len, optlen))
2069                 return -EFAULT;
2070
2071         len = min_t(unsigned int, len, sizeof(int));
2072
2073         if (len < 0)
2074                 return -EINVAL;
2075
2076         switch (optname) {
2077         case TCP_MAXSEG:
2078                 val = tp->mss_cache;
2079                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2080                         val = tp->rx_opt.user_mss;
2081                 break;
2082         case TCP_NODELAY:
2083                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2084                 break;
2085         case TCP_CORK:
2086                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2087                 break;
2088         case TCP_KEEPIDLE:
2089                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2090                 break;
2091         case TCP_KEEPINTVL:
2092                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2093                 break;
2094         case TCP_KEEPCNT:
2095                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2096                 break;
2097         case TCP_SYNCNT:
2098                 val = inet_csk(sk)->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2099                 break;
2100         case TCP_LINGER2:
2101                 val = tp->linger2;
2102                 if (val >= 0)
2103                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2104                 break;
2105         case TCP_DEFER_ACCEPT:
2106                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2107                                                (tp->defer_accept - 1));
2108                 break;
2109         case TCP_WINDOW_CLAMP:
2110                 val = tp->window_clamp;
2111                 break;
2112         case TCP_INFO: {
2113                 struct tcp_info info;
2114
2115                 if (get_user(len, optlen))
2116                         return -EFAULT;
2117
2118                 tcp_get_info(sk, &info);
2119
2120                 len = min_t(unsigned int, len, sizeof(info));
2121                 if (put_user(len, optlen))
2122                         return -EFAULT;
2123                 if (copy_to_user(optval, &info, len))
2124                         return -EFAULT;
2125                 return 0;
2126         }
2127         case TCP_QUICKACK:
2128                 val = !inet_csk(sk)->icsk_ack.pingpong;
2129                 break;
2130
2131         case TCP_CONGESTION:
2132                 if (get_user(len, optlen))
2133                         return -EFAULT;
2134                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2135                 if (put_user(len, optlen))
2136                         return -EFAULT;
2137                 if (copy_to_user(optval, tp->ca_ops->name, len))
2138                         return -EFAULT;
2139                 return 0;
2140         default:
2141                 return -ENOPROTOOPT;
2142         }
2143
2144         if (put_user(len, optlen))
2145                 return -EFAULT;
2146         if (copy_to_user(optval, &val, len))
2147                 return -EFAULT;
2148         return 0;
2149 }
2150
2151
2152 extern void __skb_cb_too_small_for_tcp(int, int);
2153 extern struct tcp_congestion_ops tcp_reno;
2154
2155 static __initdata unsigned long thash_entries;
2156 static int __init set_thash_entries(char *str)
2157 {
2158         if (!str)
2159                 return 0;
2160         thash_entries = simple_strtoul(str, &str, 0);
2161         return 1;
2162 }
2163 __setup("thash_entries=", set_thash_entries);
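
/*
 * Example: the established hash size can be overridden from the kernel
 * command line via the __setup() hook above, e.g. "thash_entries=131072";
 * the value is only a sizing hint passed to alloc_large_system_hash().
 */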
2164
2165 void __init tcp_init(void)
2166 {
2167         struct sk_buff *skb = NULL;
2168         int order, i;
2169
2170         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2171                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2172                                            sizeof(skb->cb));
2173
2174         tcp_hashinfo.bind_bucket_cachep =
2175                 kmem_cache_create("tcp_bind_bucket",
2176                                   sizeof(struct inet_bind_bucket), 0,
2177                                   SLAB_HWCACHE_ALIGN, NULL, NULL);
2178         if (!tcp_hashinfo.bind_bucket_cachep)
2179                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2180
2181         /* Size and allocate the main established and bind bucket
2182          * hash tables.
2183          *
2184          * The methodology is similar to that of the buffer cache.
2185          */
2186         tcp_hashinfo.ehash =
2187                 alloc_large_system_hash("TCP established",
2188                                         sizeof(struct inet_ehash_bucket),
2189                                         thash_entries,
2190                                         (num_physpages >= 128 * 1024) ?
2191                                                 (25 - PAGE_SHIFT) :
2192                                                 (27 - PAGE_SHIFT),
2193                                         HASH_HIGHMEM,
2194                                         &tcp_hashinfo.ehash_size,
2195                                         NULL,
2196                                         0);
2197         tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2198         for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2199                 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2200                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2201         }
2202
2203         tcp_hashinfo.bhash =
2204                 alloc_large_system_hash("TCP bind",
2205                                         sizeof(struct inet_bind_hashbucket),
2206                                         tcp_hashinfo.ehash_size,
2207                                         (num_physpages >= 128 * 1024) ?
2208                                                 (25 - PAGE_SHIFT) :
2209                                                 (27 - PAGE_SHIFT),
2210                                         HASH_HIGHMEM,
2211                                         &tcp_hashinfo.bhash_size,
2212                                         NULL,
2213                                         64 * 1024);
2214         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2215         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2216                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2217                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2218         }
2219
2220         /* Try to be a bit smarter and adjust defaults depending
2221          * on available memory.
2222          */
2223         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2224                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2225                         order++)
2226                 ;
2227         if (order >= 4) {
2228                 sysctl_local_port_range[0] = 32768;
2229                 sysctl_local_port_range[1] = 61000;
2230                 sysctl_tcp_max_tw_buckets = 180000;
2231                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2232                 sysctl_max_syn_backlog = 1024;
2233         } else if (order < 3) {
2234                 sysctl_local_port_range[0] = 1024 * (3 - order);
2235                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2236                 sysctl_tcp_max_orphans >>= (3 - order);
2237                 sysctl_max_syn_backlog = 128;
2238         }
2239         tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
2240
2241         sysctl_tcp_mem[0] =  768 << order;
2242         sysctl_tcp_mem[1] = 1024 << order;
2243         sysctl_tcp_mem[2] = 1536 << order;
2244
2245         if (order < 3) {
2246                 sysctl_tcp_wmem[2] = 64 * 1024;
2247                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2248                 sysctl_tcp_rmem[1] = 43689;
2249                 sysctl_tcp_rmem[2] = 2 * 43689;
2250         }
2251
2252         printk(KERN_INFO "TCP: Hash tables configured "
2253                "(established %d bind %d)\n",
2254                tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2255
2256         tcp_register_congestion_control(&tcp_reno);
2257 }
2258
2259 EXPORT_SYMBOL(tcp_close);
2260 EXPORT_SYMBOL(inet_csk_destroy_sock);
2261 EXPORT_SYMBOL(tcp_disconnect);
2262 EXPORT_SYMBOL(tcp_getsockopt);
2263 EXPORT_SYMBOL(tcp_ioctl);
2264 EXPORT_SYMBOL(tcp_poll);
2265 EXPORT_SYMBOL(tcp_read_sock);
2266 EXPORT_SYMBOL(tcp_recvmsg);
2267 EXPORT_SYMBOL(tcp_sendmsg);
2268 EXPORT_SYMBOL(tcp_sendpage);
2269 EXPORT_SYMBOL(tcp_setsockopt);
2270 EXPORT_SYMBOL(tcp_shutdown);
2271 EXPORT_SYMBOL(tcp_statistics);