[I/OAT]: TCP recv offload to I/OAT
net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
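
/*
 * Illustrative sketch (hypothetical, not part of this file): a minimal
 * user-space walk through the active-close states listed above.  The peer
 * address is a placeholder and error handling is reduced to a bare exit.
 */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <unistd.h>

static void active_close_example(void)
{
	struct sockaddr_in peer = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),
		.sin_addr   = { .s_addr = htonl(INADDR_LOOPBACK) },
	};
	char buf[4096];
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
		exit(1);			/* SYN_SENT -> ESTABLISHED on success */

	shutdown(fd, SHUT_WR);			/* our FIN: -> FIN_WAIT1, then FIN_WAIT2 once ACKed */
	while (read(fd, buf, sizeof(buf)) > 0)	/* peer's FIN shows up as read() == 0: -> TIME_WAIT */
		;
	close(fd);				/* descriptor released; TIME_WAIT timer reaps the rest */
}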
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260 #include <linux/cache.h>
261
262 #include <net/icmp.h>
263 #include <net/tcp.h>
264 #include <net/xfrm.h>
265 #include <net/ip.h>
266 #include <net/netdma.h>
267
268 #include <asm/uaccess.h>
269 #include <asm/ioctls.h>
270
271 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
272
273 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
274
275 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
276
277 EXPORT_SYMBOL_GPL(tcp_orphan_count);
278
279 int sysctl_tcp_mem[3] __read_mostly;
280 int sysctl_tcp_wmem[3] __read_mostly;
281 int sysctl_tcp_rmem[3] __read_mostly;
282
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
288 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
289
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292
293 /*
294  * Pressure flag: try to collapse.
295  * Technical note: it is used by multiple contexts non atomically.
296  * All the sk_stream_mem_schedule() is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300
301 EXPORT_SYMBOL(tcp_memory_pressure);
302
303 void tcp_enter_memory_pressure(void)
304 {
305         if (!tcp_memory_pressure) {
306                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307                 tcp_memory_pressure = 1;
308         }
309 }
310
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313 /*
314  *      Wait for a TCP event.
315  *
316  *      Note that we don't need to lock the socket, as the upper poll layers
317  *      take care of normal races (between the test and the event) and we don't
318  *      go look at any of the socket buffers directly.
319  */
320 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
321 {
322         unsigned int mask;
323         struct sock *sk = sock->sk;
324         struct tcp_sock *tp = tcp_sk(sk);
325
326         poll_wait(file, sk->sk_sleep, wait);
327         if (sk->sk_state == TCP_LISTEN)
328                 return inet_csk_listen_poll(sk);
329
330         /* Socket is not locked. We are protected from async events
331            by poll logic and correct handling of state changes
332            made by other threads is impossible in any case.
333          */
334
335         mask = 0;
336         if (sk->sk_err)
337                 mask = POLLERR;
338
339         /*
340          * POLLHUP is certainly not done right. But poll() doesn't
341          * have a notion of HUP in just one direction, and for a
342          * socket the read side is more interesting.
343          *
344          * Some poll() documentation says that POLLHUP is incompatible
345          * with the POLLOUT/POLLWR flags, so somebody should check this
346          * all. But careful, it tends to be safer to return too many
347          * bits than too few, and you can easily break real applications
348          * if you don't tell them that something has hung up!
349          *
350          * Check-me.
351          *
352          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
353          * our fs/select.c). It means that after we received EOF,
354          * poll always returns immediately, making it impossible to poll() for write()
355          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
356          * if and only if shutdown has been made in both directions.
357          * Actually, it is interesting to look how Solaris and DUX
358          * solve this dilemma. I would prefer, if POLLHUP were maskable,
359          * then we could set it on SND_SHUTDOWN. BTW examples given
360          * in Stevens' books assume exactly this behaviour, it explains
361          * why POLLHUP is incompatible with POLLOUT.    --ANK
362          *
363          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
364          * blocking on fresh not-connected or disconnected socket. --ANK
365          */
366         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
367                 mask |= POLLHUP;
368         if (sk->sk_shutdown & RCV_SHUTDOWN)
369                 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
370
371         /* Connected? */
372         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
373                 /* Potential race condition. If the read of tp below
374                  * escapes above the sk->sk_state check, we can be
375                  * illegally awakened in SYN_* states. */
376                 if ((tp->rcv_nxt != tp->copied_seq) &&
377                     (tp->urg_seq != tp->copied_seq ||
378                      tp->rcv_nxt != tp->copied_seq + 1 ||
379                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
380                         mask |= POLLIN | POLLRDNORM;
381
382                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
383                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
384                                 mask |= POLLOUT | POLLWRNORM;
385                         } else {  /* send SIGIO later */
386                                 set_bit(SOCK_ASYNC_NOSPACE,
387                                         &sk->sk_socket->flags);
388                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
389
390                                 /* Race breaker. If space is freed after
391                                  * wspace test but before the flags are set,
392                                  * IO signal will be lost.
393                                  */
394                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
395                                         mask |= POLLOUT | POLLWRNORM;
396                         }
397                 }
398
399                 if (tp->urg_data & TCP_URG_VALID)
400                         mask |= POLLPRI;
401         }
402         return mask;
403 }
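
/*
 * Illustrative sketch (hypothetical, not part of this file): how the mask
 * computed by tcp_poll() is typically consumed from user space.  fd is
 * assumed to be a connected TCP socket; POLLRDHUP needs _GNU_SOURCE.
 */
#define _GNU_SOURCE
#include <poll.h>

static int wait_for_tcp_event(int fd)
{
	struct pollfd pfd = {
		.fd     = fd,
		.events = POLLIN | POLLOUT | POLLRDHUP,
	};

	if (poll(&pfd, 1, 5000) <= 0)		/* timeout or error */
		return -1;
	if (pfd.revents & (POLLERR | POLLHUP))	/* sk_err set, or both directions shut down */
		return -1;
	return pfd.revents;			/* POLLIN/POLLOUT/POLLRDHUP as reported above */
}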
404
405 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
406 {
407         struct tcp_sock *tp = tcp_sk(sk);
408         int answ;
409
410         switch (cmd) {
411         case SIOCINQ:
412                 if (sk->sk_state == TCP_LISTEN)
413                         return -EINVAL;
414
415                 lock_sock(sk);
416                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
417                         answ = 0;
418                 else if (sock_flag(sk, SOCK_URGINLINE) ||
419                          !tp->urg_data ||
420                          before(tp->urg_seq, tp->copied_seq) ||
421                          !before(tp->urg_seq, tp->rcv_nxt)) {
422                         answ = tp->rcv_nxt - tp->copied_seq;
423
424                         /* Subtract 1, if FIN is in queue. */
425                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
426                                 answ -=
427                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
428                 } else
429                         answ = tp->urg_seq - tp->copied_seq;
430                 release_sock(sk);
431                 break;
432         case SIOCATMARK:
433                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
434                 break;
435         case SIOCOUTQ:
436                 if (sk->sk_state == TCP_LISTEN)
437                         return -EINVAL;
438
439                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
440                         answ = 0;
441                 else
442                         answ = tp->write_seq - tp->snd_una;
443                 break;
444         default:
445                 return -ENOIOCTLCMD;
446         };
447
448         return put_user(answ, (int __user *)arg);
449 }
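
/*
 * Illustrative sketch (hypothetical, not part of this file): the three ioctls
 * handled above, as issued from user space.  SIOCINQ/SIOCOUTQ come from
 * <linux/sockios.h>; fd is assumed to be a connected TCP socket.
 */
#include <sys/ioctl.h>
#include <linux/sockios.h>
#include <stdio.h>

static void dump_tcp_queues(int fd)
{
	int unread = 0, unsent = 0, at_mark = 0;

	ioctl(fd, SIOCINQ, &unread);	/* bytes waiting in the receive queue (FIN excluded) */
	ioctl(fd, SIOCOUTQ, &unsent);	/* bytes written but not yet acknowledged by the peer */
	ioctl(fd, SIOCATMARK, &at_mark);/* non-zero when copied_seq sits at the urgent mark */

	printf("inq=%d outq=%d at_mark=%d\n", unread, unsent, at_mark);
}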
450
451 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
452 {
453         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
454         tp->pushed_seq = tp->write_seq;
455 }
456
457 static inline int forced_push(struct tcp_sock *tp)
458 {
459         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
460 }
461
462 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
463                               struct sk_buff *skb)
464 {
465         skb->csum = 0;
466         TCP_SKB_CB(skb)->seq = tp->write_seq;
467         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
468         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
469         TCP_SKB_CB(skb)->sacked = 0;
470         skb_header_release(skb);
471         __skb_queue_tail(&sk->sk_write_queue, skb);
472         sk_charge_skb(sk, skb);
473         if (!sk->sk_send_head)
474                 sk->sk_send_head = skb;
475         if (tp->nonagle & TCP_NAGLE_PUSH)
476                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
477 }
478
479 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
480                                 struct sk_buff *skb)
481 {
482         if (flags & MSG_OOB) {
483                 tp->urg_mode = 1;
484                 tp->snd_up = tp->write_seq;
485                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
486         }
487 }
488
489 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
490                             int mss_now, int nonagle)
491 {
492         if (sk->sk_send_head) {
493                 struct sk_buff *skb = sk->sk_write_queue.prev;
494                 if (!(flags & MSG_MORE) || forced_push(tp))
495                         tcp_mark_push(tp, skb);
496                 tcp_mark_urg(tp, flags, skb);
497                 __tcp_push_pending_frames(sk, tp, mss_now,
498                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
499         }
500 }
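
/*
 * Illustrative sketch (hypothetical, not part of this file): the MSG_MORE
 * path above corks the connection, so a header and body queued by separate
 * calls still leave the host as full-sized segments.  hdr/body and their
 * lengths are placeholders supplied by the caller.
 */
#include <sys/socket.h>
#include <stddef.h>

static int send_header_then_body(int fd, const void *hdr, size_t hdr_len,
				 const void *body, size_t body_len)
{
	/* MSG_MORE: hold back a partial segment, more data is on the way. */
	if (send(fd, hdr, hdr_len, MSG_MORE) < 0)
		return -1;
	/* The final send without MSG_MORE pushes whatever is still queued. */
	if (send(fd, body, body_len, 0) < 0)
		return -1;
	return 0;
}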
501
502 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
503                          size_t psize, int flags)
504 {
505         struct tcp_sock *tp = tcp_sk(sk);
506         int mss_now, size_goal;
507         int err;
508         ssize_t copied;
509         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
510
511         /* Wait for a connection to finish. */
512         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
513                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
514                         goto out_err;
515
516         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
517
518         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
519         size_goal = tp->xmit_size_goal;
520         copied = 0;
521
522         err = -EPIPE;
523         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
524                 goto do_error;
525
526         while (psize > 0) {
527                 struct sk_buff *skb = sk->sk_write_queue.prev;
528                 struct page *page = pages[poffset / PAGE_SIZE];
529                 int copy, i, can_coalesce;
530                 int offset = poffset % PAGE_SIZE;
531                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
532
533                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
534 new_segment:
535                         if (!sk_stream_memory_free(sk))
536                                 goto wait_for_sndbuf;
537
538                         skb = sk_stream_alloc_pskb(sk, 0, 0,
539                                                    sk->sk_allocation);
540                         if (!skb)
541                                 goto wait_for_memory;
542
543                         skb_entail(sk, tp, skb);
544                         copy = size_goal;
545                 }
546
547                 if (copy > size)
548                         copy = size;
549
550                 i = skb_shinfo(skb)->nr_frags;
551                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
552                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
553                         tcp_mark_push(tp, skb);
554                         goto new_segment;
555                 }
556                 if (!sk_stream_wmem_schedule(sk, copy))
557                         goto wait_for_memory;
558                 
559                 if (can_coalesce) {
560                         skb_shinfo(skb)->frags[i - 1].size += copy;
561                 } else {
562                         get_page(page);
563                         skb_fill_page_desc(skb, i, page, offset, copy);
564                 }
565
566                 skb->len += copy;
567                 skb->data_len += copy;
568                 skb->truesize += copy;
569                 sk->sk_wmem_queued += copy;
570                 sk->sk_forward_alloc -= copy;
571                 skb->ip_summed = CHECKSUM_HW;
572                 tp->write_seq += copy;
573                 TCP_SKB_CB(skb)->end_seq += copy;
574                 skb_shinfo(skb)->tso_segs = 0;
575
576                 if (!copied)
577                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
578
579                 copied += copy;
580                 poffset += copy;
581                 if (!(psize -= copy))
582                         goto out;
583
584                 if (skb->len < mss_now || (flags & MSG_OOB))
585                         continue;
586
587                 if (forced_push(tp)) {
588                         tcp_mark_push(tp, skb);
589                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
590                 } else if (skb == sk->sk_send_head)
591                         tcp_push_one(sk, mss_now);
592                 continue;
593
594 wait_for_sndbuf:
595                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
596 wait_for_memory:
597                 if (copied)
598                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
599
600                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
601                         goto do_error;
602
603                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
604                 size_goal = tp->xmit_size_goal;
605         }
606
607 out:
608         if (copied)
609                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
610         return copied;
611
612 do_error:
613         if (copied)
614                 goto out;
615 out_err:
616         return sk_stream_error(sk, flags, err);
617 }
618
619 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
620                      size_t size, int flags)
621 {
622         ssize_t res;
623         struct sock *sk = sock->sk;
624
625 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
626
627         if (!(sk->sk_route_caps & NETIF_F_SG) ||
628             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
629                 return sock_no_sendpage(sock, page, offset, size, flags);
630
631 #undef TCP_ZC_CSUM_FLAGS
632
633         lock_sock(sk);
634         TCP_CHECK_TIMER(sk);
635         res = do_tcp_sendpages(sk, &page, offset, size, flags);
636         TCP_CHECK_TIMER(sk);
637         release_sock(sk);
638         return res;
639 }
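
/*
 * Illustrative sketch (hypothetical, not part of this file): user-space
 * sendfile() is one way the ->sendpage path above is reached; when the route
 * lacks SG or checksum offload it quietly falls back to sock_no_sendpage().
 * in_fd is a regular file, out_fd a connected TCP socket.
 */
#include <sys/sendfile.h>
#include <sys/types.h>

static ssize_t copy_file_to_socket(int out_fd, int in_fd, size_t count)
{
	off_t off = 0;
	ssize_t total = 0;

	while ((size_t)total < count) {
		ssize_t n = sendfile(out_fd, in_fd, &off, count - total);

		if (n <= 0)
			return total ? total : n;	/* error or end of file */
		total += n;
	}
	return total;
}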
640
641 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
642 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
643
644 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
645 {
646         int tmp = tp->mss_cache;
647
648         if (sk->sk_route_caps & NETIF_F_SG) {
649                 if (sk->sk_route_caps & NETIF_F_TSO)
650                         tmp = 0;
651                 else {
652                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
653
654                         if (tmp >= pgbreak &&
655                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
656                                 tmp = pgbreak;
657                 }
658         }
659
660         return tmp;
661 }
662
663 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
664                 size_t size)
665 {
666         struct iovec *iov;
667         struct tcp_sock *tp = tcp_sk(sk);
668         struct sk_buff *skb;
669         int iovlen, flags;
670         int mss_now, size_goal;
671         int err, copied;
672         long timeo;
673
674         lock_sock(sk);
675         TCP_CHECK_TIMER(sk);
676
677         flags = msg->msg_flags;
678         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
679
680         /* Wait for a connection to finish. */
681         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
682                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
683                         goto out_err;
684
685         /* This should be in poll */
686         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
687
688         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
689         size_goal = tp->xmit_size_goal;
690
691         /* Ok commence sending. */
692         iovlen = msg->msg_iovlen;
693         iov = msg->msg_iov;
694         copied = 0;
695
696         err = -EPIPE;
697         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
698                 goto do_error;
699
700         while (--iovlen >= 0) {
701                 int seglen = iov->iov_len;
702                 unsigned char __user *from = iov->iov_base;
703
704                 iov++;
705
706                 while (seglen > 0) {
707                         int copy;
708
709                         skb = sk->sk_write_queue.prev;
710
711                         if (!sk->sk_send_head ||
712                             (copy = size_goal - skb->len) <= 0) {
713
714 new_segment:
715                                 /* Allocate new segment. If the interface is SG,
716                                  * allocate skb fitting to single page.
717                                  */
718                                 if (!sk_stream_memory_free(sk))
719                                         goto wait_for_sndbuf;
720
721                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
722                                                            0, sk->sk_allocation);
723                                 if (!skb)
724                                         goto wait_for_memory;
725
726                                 /*
727                                  * Check whether we can use HW checksum.
728                                  */
729                                 if (sk->sk_route_caps &
730                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
731                                      NETIF_F_HW_CSUM))
732                                         skb->ip_summed = CHECKSUM_HW;
733
734                                 skb_entail(sk, tp, skb);
735                                 copy = size_goal;
736                         }
737
738                         /* Try to append data to the end of skb. */
739                         if (copy > seglen)
740                                 copy = seglen;
741
742                         /* Where to copy to? */
743                         if (skb_tailroom(skb) > 0) {
744                                 /* We have some space in skb head. Superb! */
745                                 if (copy > skb_tailroom(skb))
746                                         copy = skb_tailroom(skb);
747                                 if ((err = skb_add_data(skb, from, copy)) != 0)
748                                         goto do_fault;
749                         } else {
750                                 int merge = 0;
751                                 int i = skb_shinfo(skb)->nr_frags;
752                                 struct page *page = TCP_PAGE(sk);
753                                 int off = TCP_OFF(sk);
754
755                                 if (skb_can_coalesce(skb, i, page, off) &&
756                                     off != PAGE_SIZE) {
757                                         /* We can extend the last page
758                                          * fragment. */
759                                         merge = 1;
760                                 } else if (i == MAX_SKB_FRAGS ||
761                                            (!i &&
762                                            !(sk->sk_route_caps & NETIF_F_SG))) {
763                                         /* Need to add new fragment and cannot
764                                          * do this because interface is non-SG,
765                                          * or because all the page slots are
766                                          * busy. */
767                                         tcp_mark_push(tp, skb);
768                                         goto new_segment;
769                                 } else if (page) {
770                                         if (off == PAGE_SIZE) {
771                                                 put_page(page);
772                                                 TCP_PAGE(sk) = page = NULL;
773                                                 off = 0;
774                                         }
775                                 } else
776                                         off = 0;
777
778                                 if (copy > PAGE_SIZE - off)
779                                         copy = PAGE_SIZE - off;
780
781                                 if (!sk_stream_wmem_schedule(sk, copy))
782                                         goto wait_for_memory;
783
784                                 if (!page) {
785                                         /* Allocate new cache page. */
786                                         if (!(page = sk_stream_alloc_page(sk)))
787                                                 goto wait_for_memory;
788                                 }
789
790                                 /* Time to copy data. We are close to
791                                  * the end! */
792                                 err = skb_copy_to_page(sk, from, skb, page,
793                                                        off, copy);
794                                 if (err) {
795                                         /* If this page was new, give it to the
796                                          * socket so it does not get leaked.
797                                          */
798                                         if (!TCP_PAGE(sk)) {
799                                                 TCP_PAGE(sk) = page;
800                                                 TCP_OFF(sk) = 0;
801                                         }
802                                         goto do_error;
803                                 }
804
805                                 /* Update the skb. */
806                                 if (merge) {
807                                         skb_shinfo(skb)->frags[i - 1].size +=
808                                                                         copy;
809                                 } else {
810                                         skb_fill_page_desc(skb, i, page, off, copy);
811                                         if (TCP_PAGE(sk)) {
812                                                 get_page(page);
813                                         } else if (off + copy < PAGE_SIZE) {
814                                                 get_page(page);
815                                                 TCP_PAGE(sk) = page;
816                                         }
817                                 }
818
819                                 TCP_OFF(sk) = off + copy;
820                         }
821
822                         if (!copied)
823                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
824
825                         tp->write_seq += copy;
826                         TCP_SKB_CB(skb)->end_seq += copy;
827                         skb_shinfo(skb)->tso_segs = 0;
828
829                         from += copy;
830                         copied += copy;
831                         if ((seglen -= copy) == 0 && iovlen == 0)
832                                 goto out;
833
834                         if (skb->len < mss_now || (flags & MSG_OOB))
835                                 continue;
836
837                         if (forced_push(tp)) {
838                                 tcp_mark_push(tp, skb);
839                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
840                         } else if (skb == sk->sk_send_head)
841                                 tcp_push_one(sk, mss_now);
842                         continue;
843
844 wait_for_sndbuf:
845                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
846 wait_for_memory:
847                         if (copied)
848                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
849
850                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
851                                 goto do_error;
852
853                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
854                         size_goal = tp->xmit_size_goal;
855                 }
856         }
857
858 out:
859         if (copied)
860                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
861         TCP_CHECK_TIMER(sk);
862         release_sock(sk);
863         return copied;
864
865 do_fault:
866         if (!skb->len) {
867                 if (sk->sk_send_head == skb)
868                         sk->sk_send_head = NULL;
869                 __skb_unlink(skb, &sk->sk_write_queue);
870                 sk_stream_free_skb(sk, skb);
871         }
872
873 do_error:
874         if (copied)
875                 goto out;
876 out_err:
877         err = sk_stream_error(sk, flags, err);
878         TCP_CHECK_TIMER(sk);
879         release_sock(sk);
880         return err;
881 }
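
/*
 * Illustrative sketch (hypothetical, not part of this file): tcp_sendmsg()
 * above walks msg->msg_iov segment by segment, so a scattered write like this
 * one is coalesced into MSS-sized segments on the send queue.  fd is assumed
 * to be a connected TCP socket.
 */
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <string.h>

static ssize_t send_two_parts(int fd, const char *a, const char *b)
{
	struct iovec iov[2] = {
		{ .iov_base = (void *)a, .iov_len = strlen(a) },
		{ .iov_base = (void *)b, .iov_len = strlen(b) },
	};
	struct msghdr msg = {
		.msg_iov    = iov,
		.msg_iovlen = 2,
	};

	return sendmsg(fd, &msg, 0);	/* arrives here via the socket layer */
}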
882
883 /*
884  *      Handle reading urgent data. BSD has very simple semantics for
885  *      this, no blocking and very strange errors 8)
886  */
887
888 static int tcp_recv_urg(struct sock *sk, long timeo,
889                         struct msghdr *msg, int len, int flags,
890                         int *addr_len)
891 {
892         struct tcp_sock *tp = tcp_sk(sk);
893
894         /* No URG data to read. */
895         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
896             tp->urg_data == TCP_URG_READ)
897                 return -EINVAL; /* Yes this is right ! */
898
899         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
900                 return -ENOTCONN;
901
902         if (tp->urg_data & TCP_URG_VALID) {
903                 int err = 0;
904                 char c = tp->urg_data;
905
906                 if (!(flags & MSG_PEEK))
907                         tp->urg_data = TCP_URG_READ;
908
909                 /* Read urgent data. */
910                 msg->msg_flags |= MSG_OOB;
911
912                 if (len > 0) {
913                         if (!(flags & MSG_TRUNC))
914                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
915                         len = 1;
916                 } else
917                         msg->msg_flags |= MSG_TRUNC;
918
919                 return err ? -EFAULT : len;
920         }
921
922         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
923                 return 0;
924
925         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
926          * the available implementations agree in this case:
927          * this call should never block, independent of the
928          * blocking state of the socket.
929          * Mike <pall@rz.uni-karlsruhe.de>
930          */
931         return -EAGAIN;
932 }
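
/*
 * Illustrative sketch (hypothetical, not part of this file): the one-byte
 * urgent-data semantics handled by tcp_recv_urg() above, with SO_OOBINLINE
 * left off.  fd is assumed to be a connected TCP socket; the sending side
 * would have done send(fd, "!", 1, MSG_OOB).
 */
#include <sys/socket.h>

static int fetch_urgent_byte(int fd, char *out)
{
	/* Never blocks: returns -1/EAGAIN while no urgent byte is pending. */
	return recv(fd, out, 1, MSG_OOB) == 1 ? 0 : -1;
}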
933
934 /* Clean up the receive buffer for full frames taken by the user,
935  * then send an ACK if necessary.  COPIED is the number of bytes
936  * tcp_recvmsg has given to the user so far, it speeds up the
937  * calculation of whether or not we must ACK for the sake of
938  * a window update.
939  */
940 void tcp_cleanup_rbuf(struct sock *sk, int copied)
941 {
942         struct tcp_sock *tp = tcp_sk(sk);
943         int time_to_ack = 0;
944
945 #if TCP_DEBUG
946         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
947
948         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
949 #endif
950
951         if (inet_csk_ack_scheduled(sk)) {
952                 const struct inet_connection_sock *icsk = inet_csk(sk);
953                    /* Delayed ACKs frequently hit locked sockets during bulk
954                     * receive. */
955                 if (icsk->icsk_ack.blocked ||
956                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
957                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
958                     /*
959                      * If this read emptied the read buffer, we send an ACK
960                      * when the connection is not bidirectional, the user
961                      * drained the receive buffer, and there was a small
962                      * segment in the queue.
963                      */
964                     (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
965                      !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
966                         time_to_ack = 1;
967         }
968
969         /* We send an ACK if we can now advertise a non-zero window
970          * which has been raised "significantly".
971          *
972          * Even if window raised up to infinity, do not send window open ACK
973          * in states, where we will not receive more. It is useless.
974          */
975         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
976                 __u32 rcv_window_now = tcp_receive_window(tp);
977
978                 /* Optimize, __tcp_select_window() is not cheap. */
979                 if (2*rcv_window_now <= tp->window_clamp) {
980                         __u32 new_window = __tcp_select_window(sk);
981
982                         /* Send ACK now, if this read freed lots of space
983                          * in our buffer. Certainly, new_window is new window.
984                          * We can advertise it now, if it is not less than current one.
985                          * "Lots" means "at least twice" here.
986                          */
987                         if (new_window && new_window >= 2 * rcv_window_now)
988                                 time_to_ack = 1;
989                 }
990         }
991         if (time_to_ack)
992                 tcp_send_ack(sk);
993 }
994
995 static void tcp_prequeue_process(struct sock *sk)
996 {
997         struct sk_buff *skb;
998         struct tcp_sock *tp = tcp_sk(sk);
999
1000         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1001
1002         /* RX process wants to run with disabled BHs, though it is not
1003          * necessary */
1004         local_bh_disable();
1005         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1006                 sk->sk_backlog_rcv(sk, skb);
1007         local_bh_enable();
1008
1009         /* Clear memory counter. */
1010         tp->ucopy.memory = 0;
1011 }
1012
1013 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1014 {
1015         struct sk_buff *skb;
1016         u32 offset;
1017
1018         skb_queue_walk(&sk->sk_receive_queue, skb) {
1019                 offset = seq - TCP_SKB_CB(skb)->seq;
1020                 if (skb->h.th->syn)
1021                         offset--;
1022                 if (offset < skb->len || skb->h.th->fin) {
1023                         *off = offset;
1024                         return skb;
1025                 }
1026         }
1027         return NULL;
1028 }
1029
1030 /*
1031  * This routine provides an alternative to tcp_recvmsg() for routines
1032  * that would like to handle copying from skbuffs directly in 'sendfile'
1033  * fashion.
1034  * Note:
1035  *      - It is assumed that the socket was locked by the caller.
1036  *      - The routine does not block.
1037  *      - At present, there is no support for reading OOB data
1038  *        or for 'peeking' the socket using this routine
1039  *        (although both would be easy to implement).
1040  */
1041 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1042                   sk_read_actor_t recv_actor)
1043 {
1044         struct sk_buff *skb;
1045         struct tcp_sock *tp = tcp_sk(sk);
1046         u32 seq = tp->copied_seq;
1047         u32 offset;
1048         int copied = 0;
1049
1050         if (sk->sk_state == TCP_LISTEN)
1051                 return -ENOTCONN;
1052         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1053                 if (offset < skb->len) {
1054                         size_t used, len;
1055
1056                         len = skb->len - offset;
1057                         /* Stop reading if we hit a patch of urgent data */
1058                         if (tp->urg_data) {
1059                                 u32 urg_offset = tp->urg_seq - seq;
1060                                 if (urg_offset < len)
1061                                         len = urg_offset;
1062                                 if (!len)
1063                                         break;
1064                         }
1065                         used = recv_actor(desc, skb, offset, len);
1066                         if (used <= len) {
1067                                 seq += used;
1068                                 copied += used;
1069                                 offset += used;
1070                         }
1071                         if (offset != skb->len)
1072                                 break;
1073                 }
1074                 if (skb->h.th->fin) {
1075                         sk_eat_skb(sk, skb, 0);
1076                         ++seq;
1077                         break;
1078                 }
1079                 sk_eat_skb(sk, skb, 0);
1080                 if (!desc->count)
1081                         break;
1082         }
1083         tp->copied_seq = seq;
1084
1085         tcp_rcv_space_adjust(sk);
1086
1087         /* Clean up data we have read: This will do ACK frames. */
1088         if (copied)
1089                 tcp_cleanup_rbuf(sk, copied);
1090         return copied;
1091 }
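
/*
 * Illustrative sketch (hypothetical, not part of this file): a minimal
 * recv_actor that copies into a kernel buffer hung off desc->arg.data, using
 * only headers already included above.  A real caller would hold the socket
 * lock, as tcp_read_sock() assumes.
 */
static int example_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t want = min_t(size_t, len, desc->count);

	if (skb_copy_bits(skb, offset, desc->arg.data, want))
		return 0;			/* nothing consumed; tcp_read_sock() stops */
	desc->arg.data += want;
	desc->count    -= want;
	return want;				/* bytes consumed from this skb */
}

static int example_read_sock(struct sock *sk, void *buf, size_t size)
{
	read_descriptor_t desc = { .count = size, .arg.data = buf };

	return tcp_read_sock(sk, &desc, example_recv_actor);
}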
1092
1093 /*
1094  *      This routine copies from a sock struct into the user buffer.
1095  *
1096  *      Technical note: in 2.3 we work on _locked_ socket, so that
1097  *      tricks with *seq access order and skb->users are not required.
1098  *      Probably, code can be easily improved even more.
1099  */
1100
1101 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1102                 size_t len, int nonblock, int flags, int *addr_len)
1103 {
1104         struct tcp_sock *tp = tcp_sk(sk);
1105         int copied = 0;
1106         u32 peek_seq;
1107         u32 *seq;
1108         unsigned long used;
1109         int err;
1110         int target;             /* Read at least this many bytes */
1111         long timeo;
1112         struct task_struct *user_recv = NULL;
1113         int copied_early = 0;
1114
1115         lock_sock(sk);
1116
1117         TCP_CHECK_TIMER(sk);
1118
1119         err = -ENOTCONN;
1120         if (sk->sk_state == TCP_LISTEN)
1121                 goto out;
1122
1123         timeo = sock_rcvtimeo(sk, nonblock);
1124
1125         /* Urgent data needs to be handled specially. */
1126         if (flags & MSG_OOB)
1127                 goto recv_urg;
1128
1129         seq = &tp->copied_seq;
1130         if (flags & MSG_PEEK) {
1131                 peek_seq = tp->copied_seq;
1132                 seq = &peek_seq;
1133         }
1134
1135         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1136
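	/* I/OAT receive offload: hand the copy to a DMA engine only when the
	 * read is large enough to beat the setup cost (sysctl_tcp_dma_copybreak),
	 * the data is really being consumed (no MSG_PEEK), low-latency mode is
	 * off, and this CPU has a DMA channel; pinning the iovec pages up front
	 * lets the engine write into user memory asynchronously.
	 */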
1137 #ifdef CONFIG_NET_DMA
1138         tp->ucopy.dma_chan = NULL;
1139         preempt_disable();
1140         if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1141             !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) {
1142                 preempt_enable_no_resched();
1143                 tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
1144         } else
1145                 preempt_enable_no_resched();
1146 #endif
1147
1148         do {
1149                 struct sk_buff *skb;
1150                 u32 offset;
1151
1152                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1153                 if (tp->urg_data && tp->urg_seq == *seq) {
1154                         if (copied)
1155                                 break;
1156                         if (signal_pending(current)) {
1157                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1158                                 break;
1159                         }
1160                 }
1161
1162                 /* Next get a buffer. */
1163
1164                 skb = skb_peek(&sk->sk_receive_queue);
1165                 do {
1166                         if (!skb)
1167                                 break;
1168
1169                         /* Now that we have two receive queues this
1170                          * shouldn't happen.
1171                          */
1172                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1173                                 printk(KERN_INFO "recvmsg bug: copied %X "
1174                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1175                                 break;
1176                         }
1177                         offset = *seq - TCP_SKB_CB(skb)->seq;
1178                         if (skb->h.th->syn)
1179                                 offset--;
1180                         if (offset < skb->len)
1181                                 goto found_ok_skb;
1182                         if (skb->h.th->fin)
1183                                 goto found_fin_ok;
1184                         BUG_TRAP(flags & MSG_PEEK);
1185                         skb = skb->next;
1186                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1187
1188                 /* Well, if we have backlog, try to process it now. */
1189
1190                 if (copied >= target && !sk->sk_backlog.tail)
1191                         break;
1192
1193                 if (copied) {
1194                         if (sk->sk_err ||
1195                             sk->sk_state == TCP_CLOSE ||
1196                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1197                             !timeo ||
1198                             signal_pending(current) ||
1199                             (flags & MSG_PEEK))
1200                                 break;
1201                 } else {
1202                         if (sock_flag(sk, SOCK_DONE))
1203                                 break;
1204
1205                         if (sk->sk_err) {
1206                                 copied = sock_error(sk);
1207                                 break;
1208                         }
1209
1210                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1211                                 break;
1212
1213                         if (sk->sk_state == TCP_CLOSE) {
1214                                 if (!sock_flag(sk, SOCK_DONE)) {
1215                                         /* This occurs when user tries to read
1216                                          * from never connected socket.
1217                                          */
1218                                         copied = -ENOTCONN;
1219                                         break;
1220                                 }
1221                                 break;
1222                         }
1223
1224                         if (!timeo) {
1225                                 copied = -EAGAIN;
1226                                 break;
1227                         }
1228
1229                         if (signal_pending(current)) {
1230                                 copied = sock_intr_errno(timeo);
1231                                 break;
1232                         }
1233                 }
1234
1235                 tcp_cleanup_rbuf(sk, copied);
1236
1237                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1238                         /* Install new reader */
1239                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1240                                 user_recv = current;
1241                                 tp->ucopy.task = user_recv;
1242                                 tp->ucopy.iov = msg->msg_iov;
1243                         }
1244
1245                         tp->ucopy.len = len;
1246
1247                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1248                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1249
1250                         /* Ugly... If prequeue is not empty, we have to
1251                          * process it before releasing the socket, otherwise
1252                          * ordering will be broken on the second iteration.
1253                          * A more elegant solution is required!!!
1254                          *
1255                          * Look: we have the following (pseudo)queues:
1256                          *
1257                          * 1. packets in flight
1258                          * 2. backlog
1259                          * 3. prequeue
1260                          * 4. receive_queue
1261                          *
1262                          * Each queue can be processed only if the next ones
1263                          * are empty. At this point the receive_queue is empty,
1264                          * but the prequeue _can_ be non-empty after the 2nd
1265                          * iteration, when we jumped to the start of the loop
1266                          * because backlog processing added something to the
1267                          * receive_queue. We cannot release_sock(), because the
1268                          * backlog contains packets that arrived _after_ the
1269                          * prequeued ones.
1270                          *
1271                          * In short, the algorithm is to process all the queues
1272                          * in order. We could do this more directly, requeueing
1273                          * packets from the backlog to the prequeue if it is not
1274                          * empty; that is more elegant, but eats cycles.
1275                          */
1276                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1277                                 goto do_prequeue;
1278
1279                         /* __ Set realtime policy in scheduler __ */
1280                 }
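                     /*
                      * Illustrative sketch only (not from the original source): the
                      * ordering rule above, restated.  A queue may be drained only
                      * once every later queue is already empty, so a strict drain
                      * pass looks like:
                      *
                      *      drain(receive_queue);                      // 4
                      *      if (receive_queue is empty)
                      *              drain(prequeue);                   // 3
                      *      if (prequeue and receive_queue are empty)
                      *              drain(backlog);                    // 2, via release_sock()
                      *      // 1 (packets in flight) refills the others asynchronously
                      *
                      * Requeueing backlog packets onto the prequeue would merge
                      * steps 2 and 3; that is the "more elegant" alternative the
                      * comment above rejects as too costly.
                      */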
1281
1282                 if (copied >= target) {
1283                         /* Do not sleep, just process backlog. */
1284                         release_sock(sk);
1285                         lock_sock(sk);
1286                 } else
1287                         sk_wait_data(sk, &timeo);
1288
1289 #ifdef CONFIG_NET_DMA
1290                 tp->ucopy.wakeup = 0;
1291 #endif
1292
1293                 if (user_recv) {
1294                         int chunk;
1295
1296                         /* __ Restore normal policy in scheduler __ */
1297
1298                         if ((chunk = len - tp->ucopy.len) != 0) {
1299                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1300                                 len -= chunk;
1301                                 copied += chunk;
1302                         }
1303
1304                         if (tp->rcv_nxt == tp->copied_seq &&
1305                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1306 do_prequeue:
1307                                 tcp_prequeue_process(sk);
1308
1309                                 if ((chunk = len - tp->ucopy.len) != 0) {
1310                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1311                                         len -= chunk;
1312                                         copied += chunk;
1313                                 }
1314                         }
1315                 }
1316                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1317                         if (net_ratelimit())
1318                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1319                                        current->comm, current->pid);
1320                         peek_seq = tp->copied_seq;
1321                 }
1322                 continue;
1323
1324         found_ok_skb:
1325                 /* Ok so how much can we use? */
1326                 used = skb->len - offset;
1327                 if (len < used)
1328                         used = len;
1329
1330                 /* Do we have urgent data here? */
1331                 if (tp->urg_data) {
1332                         u32 urg_offset = tp->urg_seq - *seq;
1333                         if (urg_offset < used) {
1334                                 if (!urg_offset) {
1335                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1336                                                 ++*seq;
1337                                                 offset++;
1338                                                 used--;
1339                                                 if (!used)
1340                                                         goto skip_copy;
1341                                         }
1342                                 } else
1343                                         used = urg_offset;
1344                         }
1345                 }
1346
1347                 if (!(flags & MSG_TRUNC)) {
1348 #ifdef CONFIG_NET_DMA
1349                         if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1350                                 tp->ucopy.dma_chan = get_softnet_dma();
1351
1352                         if (tp->ucopy.dma_chan) {
1353                                 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1354                                         tp->ucopy.dma_chan, skb, offset,
1355                                         msg->msg_iov, used,
1356                                         tp->ucopy.pinned_list);
1357
1358                                 if (tp->ucopy.dma_cookie < 0) {
1359
1360                                         printk(KERN_ALERT "dma_cookie < 0\n");
1361
1362                                         /* Exception. Bailout! */
1363                                         if (!copied)
1364                                                 copied = -EFAULT;
1365                                         break;
1366                                 }
1367                                 if ((offset + used) == skb->len)
1368                                         copied_early = 1;
1369
1370                         } else
1371 #endif
1372                         {
1373                                 err = skb_copy_datagram_iovec(skb, offset,
1374                                                 msg->msg_iov, used);
1375                                 if (err) {
1376                                         /* Exception. Bailout! */
1377                                         if (!copied)
1378                                                 copied = -EFAULT;
1379                                         break;
1380                                 }
1381                         }
1382                 }
1383
1384                 *seq += used;
1385                 copied += used;
1386                 len -= used;
1387
1388                 tcp_rcv_space_adjust(sk);
1389
1390 skip_copy:
1391                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1392                         tp->urg_data = 0;
1393                         tcp_fast_path_check(sk, tp);
1394                 }
1395                 if (used + offset < skb->len)
1396                         continue;
1397
1398                 if (skb->h.th->fin)
1399                         goto found_fin_ok;
1400                 if (!(flags & MSG_PEEK)) {
1401                         sk_eat_skb(sk, skb, copied_early);
1402                         copied_early = 0;
1403                 }
1404                 continue;
1405
1406         found_fin_ok:
1407                 /* Process the FIN. */
1408                 ++*seq;
1409                 if (!(flags & MSG_PEEK)) {
1410                         sk_eat_skb(sk, skb, copied_early);
1411                         copied_early = 0;
1412                 }
1413                 break;
1414         } while (len > 0);
1415
1416         if (user_recv) {
1417                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1418                         int chunk;
1419
1420                         tp->ucopy.len = copied > 0 ? len : 0;
1421
1422                         tcp_prequeue_process(sk);
1423
1424                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1425                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1426                                 len -= chunk;
1427                                 copied += chunk;
1428                         }
1429                 }
1430
1431                 tp->ucopy.task = NULL;
1432                 tp->ucopy.len = 0;
1433         }
1434
1435 #ifdef CONFIG_NET_DMA
1436         if (tp->ucopy.dma_chan) {
1437                 struct sk_buff *skb;
1438                 dma_cookie_t done, used;
1439
1440                 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1441
1442                 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1443                                                  tp->ucopy.dma_cookie, &done,
1444                                                  &used) == DMA_IN_PROGRESS) {
1445                         /* do partial cleanup of sk_async_wait_queue */
1446                         while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1447                                (dma_async_is_complete(skb->dma_cookie, done,
1448                                                       used) == DMA_SUCCESS)) {
1449                                 __skb_dequeue(&sk->sk_async_wait_queue);
1450                                 kfree_skb(skb);
1451                         }
1452                 }
1453
1454                 /* Safe to free early-copied skbs now */
1455                 __skb_queue_purge(&sk->sk_async_wait_queue);
1456                 dma_chan_put(tp->ucopy.dma_chan);
1457                 tp->ucopy.dma_chan = NULL;
1458         }
1459         if (tp->ucopy.pinned_list) {
1460                 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1461                 tp->ucopy.pinned_list = NULL;
1462         }
1463 #endif
1464
1465         /* According to UNIX98, msg_name/msg_namelen are ignored
1466          * on a connected socket. I was just happy when I found this 8) --ANK
1467          */
1468
1469         /* Clean up data we have read: This will do ACK frames. */
1470         tcp_cleanup_rbuf(sk, copied);
1471
1472         TCP_CHECK_TIMER(sk);
1473         release_sock(sk);
1474         return copied;
1475
1476 out:
1477         TCP_CHECK_TIMER(sk);
1478         release_sock(sk);
1479         return err;
1480
1481 recv_urg:
1482         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1483         goto out;
1484 }
1485
1486 /*
1487  *      State processing on a close. This implements the state shift for
1488  *      sending our FIN frame. Note that we only send a FIN for some
1489  *      states. A shutdown() may have already sent the FIN, or we may be
1490  *      closed.
1491  */
1492
1493 static const unsigned char new_state[16] = {
1494   /* current state:        new state:      action:      */
1495   /* (Invalid)          */ TCP_CLOSE,
1496   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1497   /* TCP_SYN_SENT       */ TCP_CLOSE,
1498   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1499   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1500   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1501   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1502   /* TCP_CLOSE          */ TCP_CLOSE,
1503   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1504   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1505   /* TCP_LISTEN         */ TCP_CLOSE,
1506   /* TCP_CLOSING        */ TCP_CLOSING,
1507 };
1508
1509 static int tcp_close_state(struct sock *sk)
1510 {
1511         int next = (int)new_state[sk->sk_state];
1512         int ns = next & TCP_STATE_MASK;
1513
1514         tcp_set_state(sk, ns);
1515
1516         return next & TCP_ACTION_FIN;
1517 }
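     /*
      * Worked example (illustration only, not part of the original source):
      * new_state[] packs the next state and an optional "send FIN" action
      * into one value, so closing an established socket decodes as:
      *
      *      next = new_state[TCP_ESTABLISHED];   // TCP_FIN_WAIT1 | TCP_ACTION_FIN
      *      tcp_set_state(sk, next & TCP_STATE_MASK);   // -> TCP_FIN_WAIT1
      *      if (next & TCP_ACTION_FIN)           // non-zero: caller sends a FIN
      *              tcp_send_fin(sk);
      *
      * whereas new_state[TCP_LISTEN] is plain TCP_CLOSE, so no FIN is queued.
      */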
1518
1519 /*
1520  *      Shutdown the sending side of a connection. Much like close except
1521  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1522  */
1523
1524 void tcp_shutdown(struct sock *sk, int how)
1525 {
1526         /*      We need to grab some memory, and put together a FIN,
1527          *      and then put it into the queue to be sent.
1528          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1529          */
1530         if (!(how & SEND_SHUTDOWN))
1531                 return;
1532
1533         /* If we've already sent a FIN, or it's a closed state, skip this. */
1534         if ((1 << sk->sk_state) &
1535             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1536              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1537                 /* Clear out any half completed packets.  FIN if needed. */
1538                 if (tcp_close_state(sk))
1539                         tcp_send_fin(sk);
1540         }
1541 }
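     /*
      * Minimal userspace sketch (illustration only): a half-close via
      * shutdown(SHUT_WR) arrives here with SEND_SHUTDOWN set, so our FIN is
      * sent while the receive side stays open for the peer's remaining data:
      *
      *      char buf[4096];
      *      ssize_t n;
      *
      *      shutdown(fd, SHUT_WR);          // queue our FIN
      *      while ((n = read(fd, buf, sizeof(buf))) > 0)
      *              ;                       // drain whatever the peer still sends
      *      close(fd);
      */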
1542
1543 void tcp_close(struct sock *sk, long timeout)
1544 {
1545         struct sk_buff *skb;
1546         int data_was_unread = 0;
1547         int state;
1548
1549         lock_sock(sk);
1550         sk->sk_shutdown = SHUTDOWN_MASK;
1551
1552         if (sk->sk_state == TCP_LISTEN) {
1553                 tcp_set_state(sk, TCP_CLOSE);
1554
1555                 /* Special case. */
1556                 inet_csk_listen_stop(sk);
1557
1558                 goto adjudge_to_death;
1559         }
1560
1561         /*  We need to flush the recv. buffs.  We do this only on the
1562          *  descriptor close, not protocol-sourced closes, because the
1563          *  reader process may not have drained the data yet!
1564          */
1565         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1566                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1567                           skb->h.th->fin;
1568                 data_was_unread += len;
1569                 __kfree_skb(skb);
1570         }
1571
1572         sk_stream_mem_reclaim(sk);
1573
1574         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1575          * 3.10, we send a RST here because data was lost.  To
1576          * witness the awful effects of the old behavior of always
1577          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1578          * a bulk GET in an FTP client, suspend the process, wait
1579          * for the client to advertise a zero window, then kill -9
1580          * the FTP client, wheee...  Note: timeout is always zero
1581          * in such a case.
1582          */
1583         if (data_was_unread) {
1584                 /* Unread data was tossed, zap the connection. */
1585                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1586                 tcp_set_state(sk, TCP_CLOSE);
1587                 tcp_send_active_reset(sk, GFP_KERNEL);
1588         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1589                 /* Check zero linger _after_ checking for unread data. */
1590                 sk->sk_prot->disconnect(sk, 0);
1591                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1592         } else if (tcp_close_state(sk)) {
1593                 /* We FIN if the application ate all the data before
1594                  * zapping the connection.
1595                  */
1596
1597                 /* RED-PEN. Formally speaking, we have broken the TCP state
1598                  * machine. The state transitions:
1599                  *
1600                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1601                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1602                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1603                  *
1604                  * are legal only when the FIN has been sent (i.e. it is in the
1605                  * window), rather than queued out of window. Purists may complain.
1606                  *
1607                  * E.g. the "RFC state" is ESTABLISHED
1608                  * if the Linux state is FIN-WAIT-1 but the FIN has not yet been sent.
1609                  *
1610                  * The visible deviations are that we sometimes enter the
1611                  * time-wait state when it is not really required (harmless),
1612                  * and do not send active resets when the specs require them
1613                  * (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look like
1614                  * CLOSING or LAST_ACK to Linux).
1615                  * Probably I have missed a few more small holes.
1616                  *                                              --ANK
1617                  */
1618                 tcp_send_fin(sk);
1619         }
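             /*
              * Userspace sketch (illustration only): the zero-linger branch above
              * can be requested explicitly, turning close() into an abortive
              * close that emits an RST rather than a FIN:
              *
              *      struct linger lin = { .l_onoff = 1, .l_linger = 0 };
              *
              *      setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
              *      close(fd);              // disconnect path, peer sees an RST
              *
              * Closing with unread data in the receive queue has the same visible
              * effect (an RST), per the draft-ietf-tcpimpl-prob-03 note above.
              */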
1620
1621         sk_stream_wait_close(sk, timeout);
1622
1623 adjudge_to_death:
1624         state = sk->sk_state;
1625         sock_hold(sk);
1626         sock_orphan(sk);
1627         atomic_inc(sk->sk_prot->orphan_count);
1628
1629         /* It is the last release_sock in its life. It will remove backlog. */
1630         release_sock(sk);
1631
1632
1633         /* Now the socket is owned by the kernel and we acquire the BH
1634          * lock to finish the close. No need to check for user refs.
1635          */
1636         local_bh_disable();
1637         bh_lock_sock(sk);
1638         BUG_TRAP(!sock_owned_by_user(sk));
1639
1640         /* Have we already been destroyed by a softirq or backlog? */
1641         if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1642                 goto out;
1643
1644         /*      This is a (useful) BSD violation of the RFC. There is a
1645          *      problem with TCP as specified in that the other end could
1646          *      keep a socket open forever with no application left at this
1647          *      end. We use a 3 minute timeout (about the same as BSD) and
1648          *      then kill our end. If they send after that then tough - BUT:
1649          *      it is long enough that we won't repeat the old "4*rto =
1650          *      almost no time" reset mistake.
1651          *
1652          *      Nope, it was not a mistake. It is really the desired
1653          *      behaviour, e.g. on HTTP servers, where such sockets are
1654          *      useless but consume significant resources. Let's handle it
1655          *      with the special linger2 option.                --ANK
1656          */
1657
1658         if (sk->sk_state == TCP_FIN_WAIT2) {
1659                 struct tcp_sock *tp = tcp_sk(sk);
1660                 if (tp->linger2 < 0) {
1661                         tcp_set_state(sk, TCP_CLOSE);
1662                         tcp_send_active_reset(sk, GFP_ATOMIC);
1663                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1664                 } else {
1665                         const int tmo = tcp_fin_time(sk);
1666
1667                         if (tmo > TCP_TIMEWAIT_LEN) {
1668                                 inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1669                         } else {
1670                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1671                                 goto out;
1672                         }
1673                 }
1674         }
1675         if (sk->sk_state != TCP_CLOSE) {
1676                 sk_stream_mem_reclaim(sk);
1677                 if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1678                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1679                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1680                         if (net_ratelimit())
1681                                 printk(KERN_INFO "TCP: too many orphaned "
1682                                        "sockets\n");
1683                         tcp_set_state(sk, TCP_CLOSE);
1684                         tcp_send_active_reset(sk, GFP_ATOMIC);
1685                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1686                 }
1687         }
1688
1689         if (sk->sk_state == TCP_CLOSE)
1690                 inet_csk_destroy_sock(sk);
1691         /* Otherwise, socket is reprieved until protocol close. */
1692
1693 out:
1694         bh_unlock_sock(sk);
1695         local_bh_enable();
1696         sock_put(sk);
1697 }
1698
1699 /* These states need RST on ABORT according to RFC793 */
1700
1701 static inline int tcp_need_reset(int state)
1702 {
1703         return (1 << state) &
1704                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1705                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1706 }
1707
1708 int tcp_disconnect(struct sock *sk, int flags)
1709 {
1710         struct inet_sock *inet = inet_sk(sk);
1711         struct inet_connection_sock *icsk = inet_csk(sk);
1712         struct tcp_sock *tp = tcp_sk(sk);
1713         int err = 0;
1714         int old_state = sk->sk_state;
1715
1716         if (old_state != TCP_CLOSE)
1717                 tcp_set_state(sk, TCP_CLOSE);
1718
1719         /* ABORT function of RFC793 */
1720         if (old_state == TCP_LISTEN) {
1721                 inet_csk_listen_stop(sk);
1722         } else if (tcp_need_reset(old_state) ||
1723                    (tp->snd_nxt != tp->write_seq &&
1724                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1725                 /* The last check adjusts for the discrepancy between Linux
1726                  * and the RFC states.
1727                  */
1728                 tcp_send_active_reset(sk, gfp_any());
1729                 sk->sk_err = ECONNRESET;
1730         } else if (old_state == TCP_SYN_SENT)
1731                 sk->sk_err = ECONNRESET;
1732
1733         tcp_clear_xmit_timers(sk);
1734         __skb_queue_purge(&sk->sk_receive_queue);
1735         sk_stream_writequeue_purge(sk);
1736         __skb_queue_purge(&tp->out_of_order_queue);
1737 #ifdef CONFIG_NET_DMA
1738         __skb_queue_purge(&sk->sk_async_wait_queue);
1739 #endif
1740
1741         inet->dport = 0;
1742
1743         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1744                 inet_reset_saddr(sk);
1745
1746         sk->sk_shutdown = 0;
1747         sock_reset_flag(sk, SOCK_DONE);
1748         tp->srtt = 0;
1749         if ((tp->write_seq += tp->max_window + 2) == 0)
1750                 tp->write_seq = 1;
1751         icsk->icsk_backoff = 0;
1752         tp->snd_cwnd = 2;
1753         icsk->icsk_probes_out = 0;
1754         tp->packets_out = 0;
1755         tp->snd_ssthresh = 0x7fffffff;
1756         tp->snd_cwnd_cnt = 0;
1757         tp->bytes_acked = 0;
1758         tcp_set_ca_state(sk, TCP_CA_Open);
1759         tcp_clear_retrans(tp);
1760         inet_csk_delack_init(sk);
1761         sk->sk_send_head = NULL;
1762         tp->rx_opt.saw_tstamp = 0;
1763         tcp_sack_reset(&tp->rx_opt);
1764         __sk_dst_reset(sk);
1765
1766         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1767
1768         sk->sk_error_report(sk);
1769         return err;
1770 }
1771
1772 /*
1773  *      Socket option code for TCP.
1774  */
1775 static int do_tcp_setsockopt(struct sock *sk, int level,
1776                 int optname, char __user *optval, int optlen)
1777 {
1778         struct tcp_sock *tp = tcp_sk(sk);
1779         struct inet_connection_sock *icsk = inet_csk(sk);
1780         int val;
1781         int err = 0;
1782
1783         /* This is a string value; all the other options are ints. */
1784         if (optname == TCP_CONGESTION) {
1785                 char name[TCP_CA_NAME_MAX];
1786
1787                 if (optlen < 1)
1788                         return -EINVAL;
1789
1790                 val = strncpy_from_user(name, optval,
1791                                         min(TCP_CA_NAME_MAX-1, optlen));
1792                 if (val < 0)
1793                         return -EFAULT;
1794                 name[val] = 0;
1795
1796                 lock_sock(sk);
1797                 err = tcp_set_congestion_control(sk, name);
1798                 release_sock(sk);
1799                 return err;
1800         }
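             /*
              * Userspace sketch (illustration only): TCP_CONGESTION is the one
              * string-valued option handled here; selecting the built-in Reno
              * algorithm, for example:
              *
              *      const char name[] = "reno";
              *
              *      if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
              *                     name, sizeof(name)) < 0)
              *              perror("TCP_CONGESTION");
              *
              * tcp_set_congestion_control() fails (typically -ENOENT) for names
              * that are not registered and cannot be loaded as a module.
              */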
1801
1802         if (optlen < sizeof(int))
1803                 return -EINVAL;
1804
1805         if (get_user(val, (int __user *)optval))
1806                 return -EFAULT;
1807
1808         lock_sock(sk);
1809
1810         switch (optname) {
1811         case TCP_MAXSEG:
1812                 /* Values greater than the interface MTU won't take effect.
1813                  * However, at the point when this call is made we typically
1814                  * don't yet know which interface is going to be used. */
1815                 if (val < 8 || val > MAX_TCP_WINDOW) {
1816                         err = -EINVAL;
1817                         break;
1818                 }
1819                 tp->rx_opt.user_mss = val;
1820                 break;
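                     /*
                      * Userspace sketch (illustration only): clamping the MSS before
                      * the handshake, which is when rx_opt.user_mss is consulted:
                      *
                      *      int mss = 1400;
                      *
                      *      setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss));
                      *      connect(fd, ...);       // SYN advertises an MSS of at most 1400
                      *
                      * As noted above, a value larger than the interface MTU allows
                      * will simply not take effect.
                      */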
1821
1822         case TCP_NODELAY:
1823                 if (val) {
1824                         /* TCP_NODELAY is weaker than TCP_CORK, so this
1825                          * option on a corked socket is remembered, but it
1826                          * is not activated until the cork is cleared.
1827                          *
1828                          * However, when TCP_NODELAY is set we make
1829                          * an explicit push, which overrides even TCP_CORK
1830                          * for currently queued segments.
1831                          */
1832                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1833                         tcp_push_pending_frames(sk, tp);
1834                 } else {
1835                         tp->nonagle &= ~TCP_NAGLE_OFF;
1836                 }
1837                 break;
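                     /*
                      * Userspace sketch (illustration only): disabling Nagle for a
                      * request/response protocol so small writes go out immediately
                      * (req/req_len stand for the application's own buffer):
                      *
                      *      int one = 1;
                      *
                      *      setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
                      *      write(fd, req, req_len);        // pushed without coalescing
                      *
                      * On a corked socket the flag is remembered (and currently queued
                      * data is pushed), but it does not defeat the cork for later writes.
                      */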
1838
1839         case TCP_CORK:
1840                 /* When set, non-full frames are always queued.
1841                  * Later the user clears this option and we transmit
1842                  * any pending partial frames in the queue.  This is
1843                  * meant to be used alongside sendfile() to get properly
1844                  * filled frames when the user (for example) must write
1845                  * out headers with a write() call first and then use
1846                  * sendfile to send out the data parts.
1847                  *
1848                  * TCP_CORK can be set together with TCP_NODELAY and it is
1849                  * stronger than TCP_NODELAY.
1850                  */
1851                 if (val) {
1852                         tp->nonagle |= TCP_NAGLE_CORK;
1853                 } else {
1854                         tp->nonagle &= ~TCP_NAGLE_CORK;
1855                         if (tp->nonagle&TCP_NAGLE_OFF)
1856                                 tp->nonagle |= TCP_NAGLE_PUSH;
1857                         tcp_push_pending_frames(sk, tp);
1858                 }
1859                 break;
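                     /*
                      * Userspace sketch (illustration only) of the header + sendfile()
                      * pattern the comment above describes (hdr, hdr_len, file_fd and
                      * file_len are placeholders for the application's own data):
                      *
                      *      int on = 1, off = 0;
                      *
                      *      setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
                      *      write(fd, hdr, hdr_len);                 // queued, not yet sent
                      *      sendfile(fd, file_fd, NULL, file_len);   // appended to the same frames
                      *      setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
                      *                                               // uncork: push pending frames
                      */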
1860
1861         case TCP_KEEPIDLE:
1862                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1863                         err = -EINVAL;
1864                 else {
1865                         tp->keepalive_time = val * HZ;
1866                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
1867                             !((1 << sk->sk_state) &
1868                               (TCPF_CLOSE | TCPF_LISTEN))) {
1869                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1870                                 if (tp->keepalive_time > elapsed)
1871                                         elapsed = tp->keepalive_time - elapsed;
1872                                 else
1873                                         elapsed = 0;
1874                                 inet_csk_reset_keepalive_timer(sk, elapsed);
1875                         }
1876                 }
1877                 break;
1878         case TCP_KEEPINTVL:
1879                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
1880                         err = -EINVAL;
1881                 else
1882                         tp->keepalive_intvl = val * HZ;
1883                 break;
1884         case TCP_KEEPCNT:
1885                 if (val < 1 || val > MAX_TCP_KEEPCNT)
1886                         err = -EINVAL;
1887                 else
1888                         tp->keepalive_probes = val;
1889                 break;
1890         case TCP_SYNCNT:
1891                 if (val < 1 || val > MAX_TCP_SYNCNT)
1892                         err = -EINVAL;
1893                 else
1894                         icsk->icsk_syn_retries = val;
1895                 break;
1896
1897         case TCP_LINGER2:
1898                 if (val < 0)
1899                         tp->linger2 = -1;
1900                 else if (val > sysctl_tcp_fin_timeout / HZ)
1901                         tp->linger2 = 0;
1902                 else
1903                         tp->linger2 = val * HZ;
1904                 break;
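                     /*
                      * Userspace sketch (illustration only): TCP_LINGER2 is the
                      * per-socket cap on how long an orphaned connection may sit in
                      * FIN_WAIT2 (see tcp_close() above); sysctl_tcp_fin_timeout is
                      * the default:
                      *
                      *      int fin_wait2_secs = 5;
                      *
                      *      setsockopt(fd, IPPROTO_TCP, TCP_LINGER2,
                      *                 &fin_wait2_secs, sizeof(fin_wait2_secs));
                      *
                      * A negative value skips the FIN_WAIT2 wait entirely and resets
                      * the connection, matching the tp->linger2 < 0 branch in tcp_close().
                      */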
1905
1906         case TCP_DEFER_ACCEPT:
1907                 icsk->icsk_accept_queue.rskq_defer_accept = 0;
1908                 if (val > 0) {
1909                         /* Translate value in seconds to number of
1910                          * retransmits */
1911                         while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1912                                val > ((TCP_TIMEOUT_INIT / HZ) <<
1913                                        icsk->icsk_accept_queue.rskq_defer_accept))
1914                                 icsk->icsk_accept_queue.rskq_defer_accept++;
1915                         icsk->icsk_accept_queue.rskq_defer_accept++;
1916                 }
1917                 break;
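                     /*
                      * Worked example (illustration only, assuming the usual
                      * TCP_TIMEOUT_INIT of 3*HZ): a request of val = 10 seconds is
                      * converted as
                      *
                      *      10 > 3  -> rskq_defer_accept = 1
                      *      10 > 6  -> rskq_defer_accept = 2
                      *      10 > 12 -> false, loop stops, final ++ gives 3
                      *
                      * i.e. we are prepared to sit through up to 3 SYN-ACK
                      * retransmissions waiting for the first data segment.  Reading
                      * the option back via do_tcp_getsockopt() below reports
                      * (3 << (3 - 1)) = 12 seconds, the rounded-up effective timeout.
                      */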
1918
1919         case TCP_WINDOW_CLAMP:
1920                 if (!val) {
1921                         if (sk->sk_state != TCP_CLOSE) {
1922                                 err = -EINVAL;
1923                                 break;
1924                         }
1925                         tp->window_clamp = 0;
1926                 } else
1927                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1928                                                 SOCK_MIN_RCVBUF / 2 : val;
1929                 break;
1930
1931         case TCP_QUICKACK:
1932                 if (!val) {
1933                         icsk->icsk_ack.pingpong = 1;
1934                 } else {
1935                         icsk->icsk_ack.pingpong = 0;
1936                         if ((1 << sk->sk_state) &
1937                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1938                             inet_csk_ack_scheduled(sk)) {
1939                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1940                                 tcp_cleanup_rbuf(sk, 1);
1941                                 if (!(val & 1))
1942                                         icsk->icsk_ack.pingpong = 1;
1943                         }
1944                 }
1945                 break;
1946
1947         default:
1948                 err = -ENOPROTOOPT;
1949                 break;
1950         }
1951         release_sock(sk);
1952         return err;
1953 }
1954
1955 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1956                    int optlen)
1957 {
1958         struct inet_connection_sock *icsk = inet_csk(sk);
1959
1960         if (level != SOL_TCP)
1961                 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1962                                                      optval, optlen);
1963         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1964 }
1965
1966 #ifdef CONFIG_COMPAT
1967 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
1968                           char __user *optval, int optlen)
1969 {
1970         if (level != SOL_TCP)
1971                 return inet_csk_compat_setsockopt(sk, level, optname,
1972                                                   optval, optlen);
1973         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
1974 }
1975
1976 EXPORT_SYMBOL(compat_tcp_setsockopt);
1977 #endif
1978
1979 /* Return information about state of tcp endpoint in API format. */
1980 void tcp_get_info(struct sock *sk, struct tcp_info *info)
1981 {
1982         struct tcp_sock *tp = tcp_sk(sk);
1983         const struct inet_connection_sock *icsk = inet_csk(sk);
1984         u32 now = tcp_time_stamp;
1985
1986         memset(info, 0, sizeof(*info));
1987
1988         info->tcpi_state = sk->sk_state;
1989         info->tcpi_ca_state = icsk->icsk_ca_state;
1990         info->tcpi_retransmits = icsk->icsk_retransmits;
1991         info->tcpi_probes = icsk->icsk_probes_out;
1992         info->tcpi_backoff = icsk->icsk_backoff;
1993
1994         if (tp->rx_opt.tstamp_ok)
1995                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1996         if (tp->rx_opt.sack_ok)
1997                 info->tcpi_options |= TCPI_OPT_SACK;
1998         if (tp->rx_opt.wscale_ok) {
1999                 info->tcpi_options |= TCPI_OPT_WSCALE;
2000                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2001                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2002         }
2003
2004         if (tp->ecn_flags&TCP_ECN_OK)
2005                 info->tcpi_options |= TCPI_OPT_ECN;
2006
2007         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2008         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2009         info->tcpi_snd_mss = tp->mss_cache;
2010         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2011
2012         info->tcpi_unacked = tp->packets_out;
2013         info->tcpi_sacked = tp->sacked_out;
2014         info->tcpi_lost = tp->lost_out;
2015         info->tcpi_retrans = tp->retrans_out;
2016         info->tcpi_fackets = tp->fackets_out;
2017
2018         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2019         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2020         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2021
2022         info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2023         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2024         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2025         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2026         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2027         info->tcpi_snd_cwnd = tp->snd_cwnd;
2028         info->tcpi_advmss = tp->advmss;
2029         info->tcpi_reordering = tp->reordering;
2030
2031         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2032         info->tcpi_rcv_space = tp->rcvq_space.space;
2033
2034         info->tcpi_total_retrans = tp->total_retrans;
2035 }
2036
2037 EXPORT_SYMBOL_GPL(tcp_get_info);
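     /*
      * Userspace sketch (illustration only): tcp_get_info() backs the TCP_INFO
      * getsockopt, so a monitoring tool can snapshot a connection like this
      * (struct tcp_info comes from <netinet/tcp.h> in userspace):
      *
      *      struct tcp_info ti;
      *      socklen_t len = sizeof(ti);
      *
      *      if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
      *              printf("rtt %uus cwnd %u retrans %u\n",
      *                     ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_total_retrans);
      *
      * Note that tcpi_rtt is srtt >> 3 and tcpi_rttvar is mdev >> 2, already
      * converted to microseconds above.
      */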
2038
2039 static int do_tcp_getsockopt(struct sock *sk, int level,
2040                 int optname, char __user *optval, int __user *optlen)
2041 {
2042         struct inet_connection_sock *icsk = inet_csk(sk);
2043         struct tcp_sock *tp = tcp_sk(sk);
2044         int val, len;
2045
2046         if (get_user(len, optlen))
2047                 return -EFAULT;
2048
2049         len = min_t(unsigned int, len, sizeof(int));
2050
2051         if (len < 0)
2052                 return -EINVAL;
2053
2054         switch (optname) {
2055         case TCP_MAXSEG:
2056                 val = tp->mss_cache;
2057                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2058                         val = tp->rx_opt.user_mss;
2059                 break;
2060         case TCP_NODELAY:
2061                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2062                 break;
2063         case TCP_CORK:
2064                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2065                 break;
2066         case TCP_KEEPIDLE:
2067                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2068                 break;
2069         case TCP_KEEPINTVL:
2070                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2071                 break;
2072         case TCP_KEEPCNT:
2073                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2074                 break;
2075         case TCP_SYNCNT:
2076                 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2077                 break;
2078         case TCP_LINGER2:
2079                 val = tp->linger2;
2080                 if (val >= 0)
2081                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2082                 break;
2083         case TCP_DEFER_ACCEPT:
2084                 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2085                         ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2086                 break;
2087         case TCP_WINDOW_CLAMP:
2088                 val = tp->window_clamp;
2089                 break;
2090         case TCP_INFO: {
2091                 struct tcp_info info;
2092
2093                 if (get_user(len, optlen))
2094                         return -EFAULT;
2095
2096                 tcp_get_info(sk, &info);
2097
2098                 len = min_t(unsigned int, len, sizeof(info));
2099                 if (put_user(len, optlen))
2100                         return -EFAULT;
2101                 if (copy_to_user(optval, &info, len))
2102                         return -EFAULT;
2103                 return 0;
2104         }
2105         case TCP_QUICKACK:
2106                 val = !icsk->icsk_ack.pingpong;
2107                 break;
2108
2109         case TCP_CONGESTION:
2110                 if (get_user(len, optlen))
2111                         return -EFAULT;
2112                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2113                 if (put_user(len, optlen))
2114                         return -EFAULT;
2115                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2116                         return -EFAULT;
2117                 return 0;
2118         default:
2119                 return -ENOPROTOOPT;
2120         }
2121
2122         if (put_user(len, optlen))
2123                 return -EFAULT;
2124         if (copy_to_user(optval, &val, len))
2125                 return -EFAULT;
2126         return 0;
2127 }
2128
2129 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2130                    int __user *optlen)
2131 {
2132         struct inet_connection_sock *icsk = inet_csk(sk);
2133
2134         if (level != SOL_TCP)
2135                 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2136                                                      optval, optlen);
2137         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2138 }
2139
2140 #ifdef CONFIG_COMPAT
2141 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2142                           char __user *optval, int __user *optlen)
2143 {
2144         if (level != SOL_TCP)
2145                 return inet_csk_compat_getsockopt(sk, level, optname,
2146                                                   optval, optlen);
2147         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2148 }
2149
2150 EXPORT_SYMBOL(compat_tcp_getsockopt);
2151 #endif
2152
2153 extern void __skb_cb_too_small_for_tcp(int, int);
2154 extern struct tcp_congestion_ops tcp_reno;
2155
2156 static __initdata unsigned long thash_entries;
2157 static int __init set_thash_entries(char *str)
2158 {
2159         if (!str)
2160                 return 0;
2161         thash_entries = simple_strtoul(str, &str, 0);
2162         return 1;
2163 }
2164 __setup("thash_entries=", set_thash_entries);
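     /*
      * Usage note (illustration only): the established hash size is normally
      * sized from available memory, but it can be pinned on the kernel command
      * line, e.g.
      *
      *      thash_entries=131072
      *
      * which is parsed by set_thash_entries() above and handed to
      * alloc_large_system_hash() in tcp_init() below.
      */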
2165
2166 void __init tcp_init(void)
2167 {
2168         struct sk_buff *skb = NULL;
2169         unsigned long limit;
2170         int order, i, max_share;
2171
2172         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2173                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2174                                            sizeof(skb->cb));
2175
2176         tcp_hashinfo.bind_bucket_cachep =
2177                 kmem_cache_create("tcp_bind_bucket",
2178                                   sizeof(struct inet_bind_bucket), 0,
2179                                   SLAB_HWCACHE_ALIGN, NULL, NULL);
2180         if (!tcp_hashinfo.bind_bucket_cachep)
2181                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2182
2183         /* Size and allocate the main established and bind bucket
2184          * hash tables.
2185          *
2186          * The methodology is similar to that of the buffer cache.
2187          */
2188         tcp_hashinfo.ehash =
2189                 alloc_large_system_hash("TCP established",
2190                                         sizeof(struct inet_ehash_bucket),
2191                                         thash_entries,
2192                                         (num_physpages >= 128 * 1024) ?
2193                                         13 : 15,
2194                                         HASH_HIGHMEM,
2195                                         &tcp_hashinfo.ehash_size,
2196                                         NULL,
2197                                         0);
2198         tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2199         for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2200                 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2201                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2202         }
2203
2204         tcp_hashinfo.bhash =
2205                 alloc_large_system_hash("TCP bind",
2206                                         sizeof(struct inet_bind_hashbucket),
2207                                         tcp_hashinfo.ehash_size,
2208                                         (num_physpages >= 128 * 1024) ?
2209                                         13 : 15,
2210                                         HASH_HIGHMEM,
2211                                         &tcp_hashinfo.bhash_size,
2212                                         NULL,
2213                                         64 * 1024);
2214         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2215         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2216                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2217                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2218         }
2219
2220         /* Try to be a bit smarter and adjust defaults depending
2221          * on available memory.
2222          */
2223         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2224                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2225                         order++)
2226                 ;
2227         if (order >= 4) {
2228                 sysctl_local_port_range[0] = 32768;
2229                 sysctl_local_port_range[1] = 61000;
2230                 tcp_death_row.sysctl_max_tw_buckets = 180000;
2231                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2232                 sysctl_max_syn_backlog = 1024;
2233         } else if (order < 3) {
2234                 sysctl_local_port_range[0] = 1024 * (3 - order);
2235                 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2236                 sysctl_tcp_max_orphans >>= (3 - order);
2237                 sysctl_max_syn_backlog = 128;
2238         }
2239
2240         sysctl_tcp_mem[0] =  768 << order;
2241         sysctl_tcp_mem[1] = 1024 << order;
2242         sysctl_tcp_mem[2] = 1536 << order;
2243
2244         limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2245         max_share = min(4UL*1024*1024, limit);
2246
2247         sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
2248         sysctl_tcp_wmem[1] = 16*1024;
2249         sysctl_tcp_wmem[2] = max(64*1024, max_share);
2250
2251         sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
2252         sysctl_tcp_rmem[1] = 87380;
2253         sysctl_tcp_rmem[2] = max(87380, max_share);
2254
2255         printk(KERN_INFO "TCP: Hash tables configured "
2256                "(established %d bind %d)\n",
2257                tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2258
2259         tcp_register_congestion_control(&tcp_reno);
2260 }
2261
2262 EXPORT_SYMBOL(tcp_close);
2263 EXPORT_SYMBOL(tcp_disconnect);
2264 EXPORT_SYMBOL(tcp_getsockopt);
2265 EXPORT_SYMBOL(tcp_ioctl);
2266 EXPORT_SYMBOL(tcp_poll);
2267 EXPORT_SYMBOL(tcp_read_sock);
2268 EXPORT_SYMBOL(tcp_recvmsg);
2269 EXPORT_SYMBOL(tcp_sendmsg);
2270 EXPORT_SYMBOL(tcp_sendpage);
2271 EXPORT_SYMBOL(tcp_setsockopt);
2272 EXPORT_SYMBOL(tcp_shutdown);
2273 EXPORT_SYMBOL(tcp_statistics);