net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82
83 int sysctl_tcp_tw_reuse __read_mostly;
84 int sysctl_tcp_low_latency __read_mostly;
85
86 /* Check TCP sequence numbers in ICMP packets. */
87 #define ICMP_MIN_LENGTH 8
88
89 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
90
91 #ifdef CONFIG_TCP_MD5SIG
92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
93                                                    __be32 addr);
94 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
95                                    __be32 saddr, __be32 daddr,
96                                    struct tcphdr *th, unsigned int tcplen);
97 #else
98 static inline
99 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
100 {
101         return NULL;
102 }
103 #endif
104
105 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
106         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
107         .lhash_users = ATOMIC_INIT(0),
108         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
109 };
110
111 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
112 {
113         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
114                                           ip_hdr(skb)->saddr,
115                                           tcp_hdr(skb)->dest,
116                                           tcp_hdr(skb)->source);
117 }
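/* Note on the helper above: the initial sequence number is derived from the
 * connection 4-tuple (addresses and ports) via secure_tcp_sequence_number(),
 * which also mixes in a boot-time secret and a clock component, so ISNs are
 * hard for an off-path attacker to predict while still advancing over time.
 */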
118
119 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
120 {
121         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
122         struct tcp_sock *tp = tcp_sk(sk);
123
124         /* With PAWS, it is safe from the viewpoint
125            of data integrity. Even without PAWS it is safe provided sequence
126            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
127
128            Actually, the idea is close to VJ's: the timestamp cache is
129            held not per host but per port pair, and the TW bucket is
130            used as the state holder.
131
132            If the TW bucket has already been destroyed we fall back to VJ's
133            scheme and use the initial timestamp retrieved from the peer table.
134          */
135         if (tcptw->tw_ts_recent_stamp &&
136             (twp == NULL || (sysctl_tcp_tw_reuse &&
137                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
138                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
139                 if (tp->write_seq == 0)
140                         tp->write_seq = 1;
141                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
142                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
143                 sock_hold(sktw);
144                 return 1;
145         }
146
147         return 0;
148 }
149
150 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
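/* A note on the write_seq choice in tcp_twsk_unique(): when reusing a
 * TIME-WAIT port pair, the new connection's first sequence number is placed
 * 65535 + 2 beyond the old tw_snd_nxt, i.e. past the largest unscaled window
 * the old peer could still be tracking, so the old and new sequence spaces
 * should not overlap.
 */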
151
152 /* This will initiate an outgoing connection. */
153 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
154 {
155         struct inet_sock *inet = inet_sk(sk);
156         struct tcp_sock *tp = tcp_sk(sk);
157         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
158         struct rtable *rt;
159         __be32 daddr, nexthop;
160         int tmp;
161         int err;
162
163         if (addr_len < sizeof(struct sockaddr_in))
164                 return -EINVAL;
165
166         if (usin->sin_family != AF_INET)
167                 return -EAFNOSUPPORT;
168
169         nexthop = daddr = usin->sin_addr.s_addr;
170         if (inet->opt && inet->opt->srr) {
171                 if (!daddr)
172                         return -EINVAL;
173                 nexthop = inet->opt->faddr;
174         }
175
176         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
177                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
178                                IPPROTO_TCP,
179                                inet->sport, usin->sin_port, sk, 1);
180         if (tmp < 0) {
181                 if (tmp == -ENETUNREACH)
182                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
183                 return tmp;
184         }
185
186         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187                 ip_rt_put(rt);
188                 return -ENETUNREACH;
189         }
190
191         if (!inet->opt || !inet->opt->srr)
192                 daddr = rt->rt_dst;
193
194         if (!inet->saddr)
195                 inet->saddr = rt->rt_src;
196         inet->rcv_saddr = inet->saddr;
197
198         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
199                 /* Reset inherited state */
200                 tp->rx_opt.ts_recent       = 0;
201                 tp->rx_opt.ts_recent_stamp = 0;
202                 tp->write_seq              = 0;
203         }
204
205         if (tcp_death_row.sysctl_tw_recycle &&
206             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
207                 struct inet_peer *peer = rt_get_peer(rt);
208                 /*
209                  * VJ's idea. We save the last timestamp seen from
210                  * the destination in the peer table when entering
211                  * TIME-WAIT state, and initialize rx_opt.ts_recent from it
212                  * when trying a new connection.
213                  */
214                 if (peer != NULL &&
215                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
216                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
217                         tp->rx_opt.ts_recent = peer->tcp_ts;
218                 }
219         }
220
221         inet->dport = usin->sin_port;
222         inet->daddr = daddr;
223
224         inet_csk(sk)->icsk_ext_hdr_len = 0;
225         if (inet->opt)
226                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
227
228         tp->rx_opt.mss_clamp = 536;
229
230         /* Socket identity is still unknown (sport may be zero).
231          * However we set the state to SYN-SENT and, without releasing the
232          * socket lock, select a source port, enter ourselves into the hash
233          * tables and complete initialization after this.
234          */
235         tcp_set_state(sk, TCP_SYN_SENT);
236         err = inet_hash_connect(&tcp_death_row, sk);
237         if (err)
238                 goto failure;
239
240         err = ip_route_newports(&rt, IPPROTO_TCP,
241                                 inet->sport, inet->dport, sk);
242         if (err)
243                 goto failure;
244
245         /* OK, now commit destination to socket.  */
246         sk->sk_gso_type = SKB_GSO_TCPV4;
247         sk_setup_caps(sk, &rt->u.dst);
248
249         if (!tp->write_seq)
250                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
251                                                            inet->daddr,
252                                                            inet->sport,
253                                                            usin->sin_port);
254
255         inet->id = tp->write_seq ^ jiffies;
256
257         err = tcp_connect(sk);
258         rt = NULL;
259         if (err)
260                 goto failure;
261
262         return 0;
263
264 failure:
265         /*
266          * This unhashes the socket and releases the local port,
267          * if necessary.
268          */
269         tcp_set_state(sk, TCP_CLOSE);
270         ip_rt_put(rt);
271         sk->sk_route_caps = 0;
272         inet->dport = 0;
273         return err;
274 }
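/* Rough shape of a successful tcp_v4_connect() above:
 *
 *   route the nexthop   ->  pick saddr/daddr    (ip_route_connect)
 *   TCP_SYN_SENT        ->  pick a source port  (inet_hash_connect)
 *   reroute with ports  ->  commit dst to sk    (ip_route_newports, sk_setup_caps)
 *   choose write_seq    ->  send the SYN        (secure ISN, tcp_connect)
 *
 * On any failure the socket is moved back to TCP_CLOSE and the port and
 * route state are released.
 */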
275
276 /*
277  * This routine does path mtu discovery as defined in RFC1191.
278  */
279 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
280 {
281         struct dst_entry *dst;
282         struct inet_sock *inet = inet_sk(sk);
283
284         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
285          * sent out by Linux are always < 576 bytes so they should go through
286          * unfragmented).
287          */
288         if (sk->sk_state == TCP_LISTEN)
289                 return;
290
291         /* We don't check in the dst entry whether pmtu discovery is forbidden
292          * on this route. We just assume that no packet-too-big packets
293          * are sent back when pmtu discovery is not active.
294          * There is a small race when the user changes this flag in the
295          * route, but I think that's acceptable.
296          */
297         if ((dst = __sk_dst_check(sk, 0)) == NULL)
298                 return;
299
300         dst->ops->update_pmtu(dst, mtu);
301
302         /* Something is going wrong... Remember the soft error
303          * in case this connection is not able to recover.
304          */
305         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
306                 sk->sk_err_soft = EMSGSIZE;
307
308         mtu = dst_mtu(dst);
309
310         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
311             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
312                 tcp_sync_mss(sk, mtu);
313
314                 /* Resend the TCP packet because it's
315                  * clear that the old packet has been
316                  * dropped. This is the new "fast" path mtu
317                  * discovery.
318                  */
319                 tcp_simple_retransmit(sk);
320         } /* else let the usual retransmit timer handle it */
321 }
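/* Worked example for the path above: if icsk_pmtu_cookie is 1500 and the
 * ICMP_FRAG_NEEDED message advertises an MTU of 1400, tcp_sync_mss() drops
 * the cached MSS to roughly 1400 - 40 = 1360 bytes (IPv4 + TCP headers,
 * ignoring options), and tcp_simple_retransmit() resends the segments that
 * were presumably dropped at the smaller MTU.
 */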
322
323 /*
324  * This routine is called by the ICMP module when it gets some
325  * sort of error condition.  If err < 0 then the socket should
326  * be closed and the error returned to the user.  If err > 0
327  * it's just the icmp type << 8 | icmp code.  After adjustment
328  * it's just the icmp type << 8 | icmp code.  After adjustment the
329  * header points to the first 8 bytes of the tcp header.  We need
330  *
331  * The locking strategy used here is very "optimistic". When
332  * someone else accesses the socket the ICMP is just dropped
333  * and for some paths there is no check at all.
334  * A more general error queue to queue errors for later handling
335  * is probably better.
336  *
337  */
338
339 void tcp_v4_err(struct sk_buff *skb, u32 info)
340 {
341         struct iphdr *iph = (struct iphdr *)skb->data;
342         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
343         struct tcp_sock *tp;
344         struct inet_sock *inet;
345         const int type = icmp_hdr(skb)->type;
346         const int code = icmp_hdr(skb)->code;
347         struct sock *sk;
348         __u32 seq;
349         int err;
350
351         if (skb->len < (iph->ihl << 2) + 8) {
352                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
353                 return;
354         }
355
356         sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
357                         iph->saddr, th->source, inet_iif(skb));
358         if (!sk) {
359                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
360                 return;
361         }
362         if (sk->sk_state == TCP_TIME_WAIT) {
363                 inet_twsk_put(inet_twsk(sk));
364                 return;
365         }
366
367         bh_lock_sock(sk);
368         /* If too many ICMPs get dropped on busy
369          * servers this needs to be solved differently.
370          */
371         if (sock_owned_by_user(sk))
372                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
373
374         if (sk->sk_state == TCP_CLOSE)
375                 goto out;
376
377         tp = tcp_sk(sk);
378         seq = ntohl(th->seq);
379         if (sk->sk_state != TCP_LISTEN &&
380             !between(seq, tp->snd_una, tp->snd_nxt)) {
381                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
382                 goto out;
383         }
384
385         switch (type) {
386         case ICMP_SOURCE_QUENCH:
387                 /* Just silently ignore these. */
388                 goto out;
389         case ICMP_PARAMETERPROB:
390                 err = EPROTO;
391                 break;
392         case ICMP_DEST_UNREACH:
393                 if (code > NR_ICMP_UNREACH)
394                         goto out;
395
396                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
397                         if (!sock_owned_by_user(sk))
398                                 do_pmtu_discovery(sk, iph, info);
399                         goto out;
400                 }
401
402                 err = icmp_err_convert[code].errno;
403                 break;
404         case ICMP_TIME_EXCEEDED:
405                 err = EHOSTUNREACH;
406                 break;
407         default:
408                 goto out;
409         }
410
411         switch (sk->sk_state) {
412                 struct request_sock *req, **prev;
413         case TCP_LISTEN:
414                 if (sock_owned_by_user(sk))
415                         goto out;
416
417                 req = inet_csk_search_req(sk, &prev, th->dest,
418                                           iph->daddr, iph->saddr);
419                 if (!req)
420                         goto out;
421
422                 /* ICMPs are not backlogged, hence we cannot get
423                    an established socket here.
424                  */
425                 BUG_TRAP(!req->sk);
426
427                 if (seq != tcp_rsk(req)->snt_isn) {
428                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
429                         goto out;
430                 }
431
432                 /*
433                  * Still in SYN_RECV, just remove it silently.
434                  * There is no good way to pass the error to the newly
435                  * created socket, and POSIX does not want network
436                  * errors returned from accept().
437                  */
438                 inet_csk_reqsk_queue_drop(sk, req, prev);
439                 goto out;
440
441         case TCP_SYN_SENT:
442         case TCP_SYN_RECV:  /* Cannot happen normally,
443                                but it can, e.g., if SYNs crossed.
444                              */
445                 if (!sock_owned_by_user(sk)) {
446                         sk->sk_err = err;
447
448                         sk->sk_error_report(sk);
449
450                         tcp_done(sk);
451                 } else {
452                         sk->sk_err_soft = err;
453                 }
454                 goto out;
455         }
456
457         /* If we've already connected we will keep trying
458          * until we time out, or the user gives up.
459          *
460          * rfc1122 4.2.3.9 allows us to consider as hard errors
461          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
462          * but it is obsoleted by pmtu discovery).
463          *
464          * Note that in the modern internet, where routing is unreliable
465          * and broken firewalls sit in every dark corner, sending random
466          * errors ordered by their masters, even these two messages finally
467          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
468          *
469          * Now we are in compliance with RFCs.
470          *                                                      --ANK (980905)
471          */
472
473         inet = inet_sk(sk);
474         if (!sock_owned_by_user(sk) && inet->recverr) {
475                 sk->sk_err = err;
476                 sk->sk_error_report(sk);
477         } else  { /* Only an error on timeout */
478                 sk->sk_err_soft = err;
479         }
480
481 out:
482         bh_unlock_sock(sk);
483         sock_put(sk);
484 }
485
486 /* This routine computes an IPv4 TCP checksum. */
487 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
488 {
489         struct inet_sock *inet = inet_sk(sk);
490         struct tcphdr *th = tcp_hdr(skb);
491
492         if (skb->ip_summed == CHECKSUM_PARTIAL) {
493                 th->check = ~tcp_v4_check(len, inet->saddr,
494                                           inet->daddr, 0);
495                 skb->csum_start = skb_transport_header(skb) - skb->head;
496                 skb->csum_offset = offsetof(struct tcphdr, check);
497         } else {
498                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
499                                          csum_partial((char *)th,
500                                                       th->doff << 2,
501                                                       skb->csum));
502         }
503 }
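/* In the CHECKSUM_PARTIAL branch above only the pseudo-header sum is filled
 * in, and csum_start/csum_offset tell the device (or the software fallback)
 * where to finish the checksum; otherwise the full checksum is computed here
 * over the TCP header plus the payload sum already held in skb->csum.
 */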
504
505 int tcp_v4_gso_send_check(struct sk_buff *skb)
506 {
507         const struct iphdr *iph;
508         struct tcphdr *th;
509
510         if (!pskb_may_pull(skb, sizeof(*th)))
511                 return -EINVAL;
512
513         iph = ip_hdr(skb);
514         th = tcp_hdr(skb);
515
516         th->check = 0;
517         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
518         skb->csum_start = skb_transport_header(skb) - skb->head;
519         skb->csum_offset = offsetof(struct tcphdr, check);
520         skb->ip_summed = CHECKSUM_PARTIAL;
521         return 0;
522 }
523
524 /*
525  *      This routine will send an RST to the other tcp.
526  *
527  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
528  *                    for the reset?
529  *      Answer: if a packet caused the RST, it is not for a socket
530  *              existing in our system; if it is matched to a socket,
531  *              it is just a duplicate segment or a bug in the other side's
532  *              TCP. So we build the reply based only on the parameters
533  *              that arrived with the segment.
534  *      Exception: precedence violation. We do not implement it in any case.
535  */
536
537 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
538 {
539         struct tcphdr *th = tcp_hdr(skb);
540         struct {
541                 struct tcphdr th;
542 #ifdef CONFIG_TCP_MD5SIG
543                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
544 #endif
545         } rep;
546         struct ip_reply_arg arg;
547 #ifdef CONFIG_TCP_MD5SIG
548         struct tcp_md5sig_key *key;
549 #endif
550
551         /* Never send a reset in response to a reset. */
552         if (th->rst)
553                 return;
554
555         if (skb->rtable->rt_type != RTN_LOCAL)
556                 return;
557
558         /* Swap the send and the receive. */
559         memset(&rep, 0, sizeof(rep));
560         rep.th.dest   = th->source;
561         rep.th.source = th->dest;
562         rep.th.doff   = sizeof(struct tcphdr) / 4;
563         rep.th.rst    = 1;
564
565         if (th->ack) {
566                 rep.th.seq = th->ack_seq;
567         } else {
568                 rep.th.ack = 1;
569                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
570                                        skb->len - (th->doff << 2));
571         }
572
573         memset(&arg, 0, sizeof(arg));
574         arg.iov[0].iov_base = (unsigned char *)&rep;
575         arg.iov[0].iov_len  = sizeof(rep.th);
576
577 #ifdef CONFIG_TCP_MD5SIG
578         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
579         if (key) {
580                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
581                                    (TCPOPT_NOP << 16) |
582                                    (TCPOPT_MD5SIG << 8) |
583                                    TCPOLEN_MD5SIG);
584                 /* Update length and the length the header thinks exists */
585                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
586                 rep.th.doff = arg.iov[0].iov_len / 4;
587
588                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
589                                         key,
590                                         ip_hdr(skb)->daddr,
591                                         ip_hdr(skb)->saddr,
592                                         &rep.th, arg.iov[0].iov_len);
593         }
594 #endif
595         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
596                                       ip_hdr(skb)->saddr, /* XXX */
597                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
598         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
599
600         ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
601                       &arg, arg.iov[0].iov_len);
602
603         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
604         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
605 }
606
607 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
608    outside socket context, is certainly ugly. What can I do?
609  */
610
611 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
612                             u32 win, u32 ts, int oif,
613                             struct tcp_md5sig_key *key)
614 {
615         struct tcphdr *th = tcp_hdr(skb);
616         struct {
617                 struct tcphdr th;
618                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
619 #ifdef CONFIG_TCP_MD5SIG
620                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
621 #endif
622                         ];
623         } rep;
624         struct ip_reply_arg arg;
625
626         memset(&rep.th, 0, sizeof(struct tcphdr));
627         memset(&arg, 0, sizeof(arg));
628
629         arg.iov[0].iov_base = (unsigned char *)&rep;
630         arg.iov[0].iov_len  = sizeof(rep.th);
631         if (ts) {
632                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
633                                    (TCPOPT_TIMESTAMP << 8) |
634                                    TCPOLEN_TIMESTAMP);
635                 rep.opt[1] = htonl(tcp_time_stamp);
636                 rep.opt[2] = htonl(ts);
637                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
638         }
639
640         /* Swap the send and the receive. */
641         rep.th.dest    = th->source;
642         rep.th.source  = th->dest;
643         rep.th.doff    = arg.iov[0].iov_len / 4;
644         rep.th.seq     = htonl(seq);
645         rep.th.ack_seq = htonl(ack);
646         rep.th.ack     = 1;
647         rep.th.window  = htons(win);
648
649 #ifdef CONFIG_TCP_MD5SIG
650         if (key) {
651                 int offset = (ts) ? 3 : 0;
652
653                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
654                                           (TCPOPT_NOP << 16) |
655                                           (TCPOPT_MD5SIG << 8) |
656                                           TCPOLEN_MD5SIG);
657                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
658                 rep.th.doff = arg.iov[0].iov_len/4;
659
660                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
661                                         key,
662                                         ip_hdr(skb)->daddr,
663                                         ip_hdr(skb)->saddr,
664                                         &rep.th, arg.iov[0].iov_len);
665         }
666 #endif
667         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
668                                       ip_hdr(skb)->saddr, /* XXX */
669                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
670         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
671         if (oif)
672                 arg.bound_dev_if = oif;
673
674         ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
675                       &arg, arg.iov[0].iov_len);
676
677         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
678 }
679
680 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
681 {
682         struct inet_timewait_sock *tw = inet_twsk(sk);
683         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
684
685         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
686                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
687                         tcptw->tw_ts_recent,
688                         tw->tw_bound_dev_if,
689                         tcp_twsk_md5_key(tcptw)
690                         );
691
692         inet_twsk_put(tw);
693 }
694
695 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
696                                   struct request_sock *req)
697 {
698         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
699                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
700                         req->ts_recent,
701                         0,
702                         tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
703 }
704
705 /*
706  *      Send a SYN-ACK after having received a SYN.
707  *      This still operates on a request_sock only, not on a big
708  *      socket.
709  */
710 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
711                                 struct dst_entry *dst)
712 {
713         const struct inet_request_sock *ireq = inet_rsk(req);
714         int err = -1;
715         struct sk_buff * skb;
716
717         /* First, grab a route. */
718         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
719                 return -1;
720
721         skb = tcp_make_synack(sk, dst, req);
722
723         if (skb) {
724                 struct tcphdr *th = tcp_hdr(skb);
725
726                 th->check = tcp_v4_check(skb->len,
727                                          ireq->loc_addr,
728                                          ireq->rmt_addr,
729                                          csum_partial((char *)th, skb->len,
730                                                       skb->csum));
731
732                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
733                                             ireq->rmt_addr,
734                                             ireq->opt);
735                 err = net_xmit_eval(err);
736         }
737
738         dst_release(dst);
739         return err;
740 }
741
742 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
743 {
744         return __tcp_v4_send_synack(sk, req, NULL);
745 }
746
747 /*
748  *      IPv4 request_sock destructor.
749  */
750 static void tcp_v4_reqsk_destructor(struct request_sock *req)
751 {
752         kfree(inet_rsk(req)->opt);
753 }
754
755 #ifdef CONFIG_SYN_COOKIES
756 static void syn_flood_warning(struct sk_buff *skb)
757 {
758         static unsigned long warntime;
759
760         if (time_after(jiffies, (warntime + HZ * 60))) {
761                 warntime = jiffies;
762                 printk(KERN_INFO
763                        "possible SYN flooding on port %d. Sending cookies.\n",
764                        ntohs(tcp_hdr(skb)->dest));
765         }
766 }
767 #endif
768
769 /*
770  * Save and compile IPv4 options into the request_sock if needed.
771  */
772 static struct ip_options *tcp_v4_save_options(struct sock *sk,
773                                               struct sk_buff *skb)
774 {
775         struct ip_options *opt = &(IPCB(skb)->opt);
776         struct ip_options *dopt = NULL;
777
778         if (opt && opt->optlen) {
779                 int opt_size = optlength(opt);
780                 dopt = kmalloc(opt_size, GFP_ATOMIC);
781                 if (dopt) {
782                         if (ip_options_echo(dopt, skb)) {
783                                 kfree(dopt);
784                                 dopt = NULL;
785                         }
786                 }
787         }
788         return dopt;
789 }
790
791 #ifdef CONFIG_TCP_MD5SIG
792 /*
793  * RFC2385 MD5 checksumming requires a mapping of
794  * IP address->MD5 Key.
795  * We need to maintain these in the sk structure.
796  */
797
798 /* Find the Key structure for an address.  */
799 static struct tcp_md5sig_key *
800                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
801 {
802         struct tcp_sock *tp = tcp_sk(sk);
803         int i;
804
805         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
806                 return NULL;
807         for (i = 0; i < tp->md5sig_info->entries4; i++) {
808                 if (tp->md5sig_info->keys4[i].addr == addr)
809                         return &tp->md5sig_info->keys4[i].base;
810         }
811         return NULL;
812 }
813
814 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
815                                          struct sock *addr_sk)
816 {
817         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
818 }
819
820 EXPORT_SYMBOL(tcp_v4_md5_lookup);
821
822 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
823                                                       struct request_sock *req)
824 {
825         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
826 }
827
828 /* This can be called on a newly created socket, from other files */
829 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
830                       u8 *newkey, u8 newkeylen)
831 {
832         /* Add Key to the list */
833         struct tcp_md5sig_key *key;
834         struct tcp_sock *tp = tcp_sk(sk);
835         struct tcp4_md5sig_key *keys;
836
837         key = tcp_v4_md5_do_lookup(sk, addr);
838         if (key) {
839                 /* Pre-existing entry - just update that one. */
840                 kfree(key->key);
841                 key->key = newkey;
842                 key->keylen = newkeylen;
843         } else {
844                 struct tcp_md5sig_info *md5sig;
845
846                 if (!tp->md5sig_info) {
847                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
848                                                   GFP_ATOMIC);
849                         if (!tp->md5sig_info) {
850                                 kfree(newkey);
851                                 return -ENOMEM;
852                         }
853                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
854                 }
855                 if (tcp_alloc_md5sig_pool() == NULL) {
856                         kfree(newkey);
857                         return -ENOMEM;
858                 }
859                 md5sig = tp->md5sig_info;
860
861                 if (md5sig->alloced4 == md5sig->entries4) {
862                         keys = kmalloc((sizeof(*keys) *
863                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
864                         if (!keys) {
865                                 kfree(newkey);
866                                 tcp_free_md5sig_pool();
867                                 return -ENOMEM;
868                         }
869
870                         if (md5sig->entries4)
871                                 memcpy(keys, md5sig->keys4,
872                                        sizeof(*keys) * md5sig->entries4);
873
874                         /* Free old key list, and reference new one */
875                         kfree(md5sig->keys4);
876                         md5sig->keys4 = keys;
877                         md5sig->alloced4++;
878                 }
879                 md5sig->entries4++;
880                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
881                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
882                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
883         }
884         return 0;
885 }
886
887 EXPORT_SYMBOL(tcp_v4_md5_do_add);
888
889 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
890                                u8 *newkey, u8 newkeylen)
891 {
892         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
893                                  newkey, newkeylen);
894 }
895
896 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
897 {
898         struct tcp_sock *tp = tcp_sk(sk);
899         int i;
900
901         for (i = 0; i < tp->md5sig_info->entries4; i++) {
902                 if (tp->md5sig_info->keys4[i].addr == addr) {
903                         /* Free the key */
904                         kfree(tp->md5sig_info->keys4[i].base.key);
905                         tp->md5sig_info->entries4--;
906
907                         if (tp->md5sig_info->entries4 == 0) {
908                                 kfree(tp->md5sig_info->keys4);
909                                 tp->md5sig_info->keys4 = NULL;
910                                 tp->md5sig_info->alloced4 = 0;
911                         } else if (tp->md5sig_info->entries4 != i) {
912                                 /* Need to do some manipulation */
913                                 memmove(&tp->md5sig_info->keys4[i],
914                                         &tp->md5sig_info->keys4[i+1],
915                                         (tp->md5sig_info->entries4 - i) *
916                                          sizeof(struct tcp4_md5sig_key));
917                         }
918                         tcp_free_md5sig_pool();
919                         return 0;
920                 }
921         }
922         return -ENOENT;
923 }
924
925 EXPORT_SYMBOL(tcp_v4_md5_do_del);
926
927 static void tcp_v4_clear_md5_list(struct sock *sk)
928 {
929         struct tcp_sock *tp = tcp_sk(sk);
930
931         /* Free each key, then the array of keys,
932          * the crypto element, and then decrement our
933          * hold on the last resort crypto.
934          */
935         if (tp->md5sig_info->entries4) {
936                 int i;
937                 for (i = 0; i < tp->md5sig_info->entries4; i++)
938                         kfree(tp->md5sig_info->keys4[i].base.key);
939                 tp->md5sig_info->entries4 = 0;
940                 tcp_free_md5sig_pool();
941         }
942         if (tp->md5sig_info->keys4) {
943                 kfree(tp->md5sig_info->keys4);
944                 tp->md5sig_info->keys4 = NULL;
945                 tp->md5sig_info->alloced4  = 0;
946         }
947 }
948
949 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
950                                  int optlen)
951 {
952         struct tcp_md5sig cmd;
953         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
954         u8 *newkey;
955
956         if (optlen < sizeof(cmd))
957                 return -EINVAL;
958
959         if (copy_from_user(&cmd, optval, sizeof(cmd)))
960                 return -EFAULT;
961
962         if (sin->sin_family != AF_INET)
963                 return -EINVAL;
964
965         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
966                 if (!tcp_sk(sk)->md5sig_info)
967                         return -ENOENT;
968                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
969         }
970
971         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
972                 return -EINVAL;
973
974         if (!tcp_sk(sk)->md5sig_info) {
975                 struct tcp_sock *tp = tcp_sk(sk);
976                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
977
978                 if (!p)
979                         return -EINVAL;
980
981                 tp->md5sig_info = p;
982                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
983         }
984
985         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
986         if (!newkey)
987                 return -ENOMEM;
988         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
989                                  newkey, cmd.tcpm_keylen);
990 }
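/* For reference, the setsockopt() path above is what userspace reaches with
 * something like the following (a sketch, error handling omitted; peer_addr
 * and fd are placeholders):
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      sin->sin_family = AF_INET;
 *      sin->sin_addr.s_addr = peer_addr;       // peer whose segments are signed
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing a zero tcpm_keylen deletes the key for that peer via
 * tcp_v4_md5_do_del().
 */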
991
992 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
993                                    __be32 saddr, __be32 daddr,
994                                    struct tcphdr *th,
995                                    unsigned int tcplen)
996 {
997         struct tcp_md5sig_pool *hp;
998         struct tcp4_pseudohdr *bp;
999         int err;
1000
1001         /*
1002          * Okay, so RFC2385 is turned on for this connection,
1003          * so we need to generate the MD5 hash for the packet now.
1004          */
1005
1006         hp = tcp_get_md5sig_pool();
1007         if (!hp)
1008                 goto clear_hash_noput;
1009
1010         bp = &hp->md5_blk.ip4;
1011
1012         /*
1013          * The TCP pseudo-header (in the order: source IP address,
1014          * destination IP address, zero-padded protocol number, and
1015          * segment length)
1016          */
1017         bp->saddr = saddr;
1018         bp->daddr = daddr;
1019         bp->pad = 0;
1020         bp->protocol = IPPROTO_TCP;
1021         bp->len = htons(tcplen);
1022
1023         err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
1024                                 th, tcplen, hp);
1025         if (err)
1026                 goto clear_hash;
1027
1028         /* Free up the crypto pool */
1029         tcp_put_md5sig_pool();
1030 out:
1031         return 0;
1032 clear_hash:
1033         tcp_put_md5sig_pool();
1034 clear_hash_noput:
1035         memset(md5_hash, 0, 16);
1036         goto out;
1037 }
1038
1039 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1040                          struct sock *sk,
1041                          struct dst_entry *dst,
1042                          struct request_sock *req,
1043                          struct tcphdr *th,
1044                          unsigned int tcplen)
1045 {
1046         __be32 saddr, daddr;
1047
1048         if (sk) {
1049                 saddr = inet_sk(sk)->saddr;
1050                 daddr = inet_sk(sk)->daddr;
1051         } else {
1052                 struct rtable *rt = (struct rtable *)dst;
1053                 BUG_ON(!rt);
1054                 saddr = rt->rt_src;
1055                 daddr = rt->rt_dst;
1056         }
1057         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1058                                        saddr, daddr,
1059                                        th, tcplen);
1060 }
1061
1062 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1063
1064 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1065 {
1066         /*
1067          * This gets called for each TCP segment that arrives
1068          * so we want to be efficient.
1069          * We have 3 drop cases:
1070          * o No MD5 hash and one expected.
1071          * o MD5 hash and we're not expecting one.
1072          * o MD5 hash and it's wrong.
1073          */
1074         __u8 *hash_location = NULL;
1075         struct tcp_md5sig_key *hash_expected;
1076         const struct iphdr *iph = ip_hdr(skb);
1077         struct tcphdr *th = tcp_hdr(skb);
1078         int genhash;
1079         unsigned char newhash[16];
1080
1081         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1082         hash_location = tcp_parse_md5sig_option(th);
1083
1084         /* We've parsed the options - do we have a hash? */
1085         if (!hash_expected && !hash_location)
1086                 return 0;
1087
1088         if (hash_expected && !hash_location) {
1089                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1090                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1091                                NIPQUAD(iph->saddr), ntohs(th->source),
1092                                NIPQUAD(iph->daddr), ntohs(th->dest));
1093                 return 1;
1094         }
1095
1096         if (!hash_expected && hash_location) {
1097                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1098                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1099                                NIPQUAD(iph->saddr), ntohs(th->source),
1100                                NIPQUAD(iph->daddr), ntohs(th->dest));
1101                 return 1;
1102         }
1103
1104         /* Okay, so this is hash_expected and hash_location -
1105          * so we need to calculate the checksum.
1106          */
1107         genhash = tcp_v4_do_calc_md5_hash(newhash,
1108                                           hash_expected,
1109                                           iph->saddr, iph->daddr,
1110                                           th, skb->len);
1111
1112         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1113                 if (net_ratelimit()) {
1114                         printk(KERN_INFO "MD5 Hash failed for "
1115                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1116                                NIPQUAD(iph->saddr), ntohs(th->source),
1117                                NIPQUAD(iph->daddr), ntohs(th->dest),
1118                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1119                 }
1120                 return 1;
1121         }
1122         return 0;
1123 }
1124
1125 #endif
1126
1127 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1128         .family         =       PF_INET,
1129         .obj_size       =       sizeof(struct tcp_request_sock),
1130         .rtx_syn_ack    =       tcp_v4_send_synack,
1131         .send_ack       =       tcp_v4_reqsk_send_ack,
1132         .destructor     =       tcp_v4_reqsk_destructor,
1133         .send_reset     =       tcp_v4_send_reset,
1134 };
1135
1136 #ifdef CONFIG_TCP_MD5SIG
1137 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1138         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1139 };
1140 #endif
1141
1142 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1143         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1144         .twsk_unique    = tcp_twsk_unique,
1145         .twsk_destructor= tcp_twsk_destructor,
1146 };
1147
1148 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1149 {
1150         struct inet_request_sock *ireq;
1151         struct tcp_options_received tmp_opt;
1152         struct request_sock *req;
1153         __be32 saddr = ip_hdr(skb)->saddr;
1154         __be32 daddr = ip_hdr(skb)->daddr;
1155         __u32 isn = TCP_SKB_CB(skb)->when;
1156         struct dst_entry *dst = NULL;
1157 #ifdef CONFIG_SYN_COOKIES
1158         int want_cookie = 0;
1159 #else
1160 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1161 #endif
1162
1163         /* Never answer SYNs sent to broadcast or multicast */
1164         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1165                 goto drop;
1166
1167         /* TW buckets are converted to open requests without
1168          * limitations; they conserve resources and the peer is
1169          * evidently a real one.
1170          */
1171         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1172 #ifdef CONFIG_SYN_COOKIES
1173                 if (sysctl_tcp_syncookies) {
1174                         want_cookie = 1;
1175                 } else
1176 #endif
1177                 goto drop;
1178         }
1179
1180         /* Accept backlog is full. If we have already queued enough
1181          * warm entries in the syn queue, drop the request. It is better
1182          * than clogging the syn queue with openreqs with exponentially
1183          * increasing timeout.
1184          */
1185         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1186                 goto drop;
1187
1188         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1189         if (!req)
1190                 goto drop;
1191
1192 #ifdef CONFIG_TCP_MD5SIG
1193         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1194 #endif
1195
1196         tcp_clear_options(&tmp_opt);
1197         tmp_opt.mss_clamp = 536;
1198         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1199
1200         tcp_parse_options(skb, &tmp_opt, 0);
1201
1202         if (want_cookie && !tmp_opt.saw_tstamp)
1203                 tcp_clear_options(&tmp_opt);
1204
1205         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1206                 /* Some OSes (unknown ones, but I see them on a web server
1207                  * that contains information interesting only for Windows
1208                  * users) do not send their timestamp in the SYN. It is an
1209                  * easy case. We simply do not advertise TS support.
1210                  */
1211                 tmp_opt.saw_tstamp = 0;
1212                 tmp_opt.tstamp_ok  = 0;
1213         }
1214         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1215
1216         tcp_openreq_init(req, &tmp_opt, skb);
1217
1218         if (security_inet_conn_request(sk, skb, req))
1219                 goto drop_and_free;
1220
1221         ireq = inet_rsk(req);
1222         ireq->loc_addr = daddr;
1223         ireq->rmt_addr = saddr;
1224         ireq->opt = tcp_v4_save_options(sk, skb);
1225         if (!want_cookie)
1226                 TCP_ECN_create_request(req, tcp_hdr(skb));
1227
1228         if (want_cookie) {
1229 #ifdef CONFIG_SYN_COOKIES
1230                 syn_flood_warning(skb);
1231                 req->cookie_ts = tmp_opt.tstamp_ok;
1232 #endif
1233                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1234         } else if (!isn) {
1235                 struct inet_peer *peer = NULL;
1236
1237                 /* VJ's idea. We save the last timestamp seen
1238                  * from the destination in the peer table when entering
1239                  * TIME-WAIT state, and check against it before
1240                  * accepting a new connection request.
1241                  *
1242                  * If "isn" is not zero, this request hit an alive
1243                  * timewait bucket, so all the necessary checks
1244                  * are made in the function processing the timewait state.
1245                  */
1246                 if (tmp_opt.saw_tstamp &&
1247                     tcp_death_row.sysctl_tw_recycle &&
1248                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1249                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1250                     peer->v4daddr == saddr) {
1251                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1252                             (s32)(peer->tcp_ts - req->ts_recent) >
1253                                                         TCP_PAWS_WINDOW) {
1254                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1255                                 goto drop_and_release;
1256                         }
1257                 }
1258                 /* Kill the following clause, if you dislike this way. */
1259                 else if (!sysctl_tcp_syncookies &&
1260                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1261                           (sysctl_max_syn_backlog >> 2)) &&
1262                          (!peer || !peer->tcp_ts_stamp) &&
1263                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1264                         /* Without syncookies the last quarter of the
1265                          * backlog is filled with destinations proven
1266                          * to be alive.
1267                          * It means that we continue to communicate only
1268                          * with destinations already remembered at the
1269                          * moment of the synflood.
1270                          */
1271                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1272                                        "request from " NIPQUAD_FMT "/%u\n",
1273                                        NIPQUAD(saddr),
1274                                        ntohs(tcp_hdr(skb)->source));
1275                         goto drop_and_release;
1276                 }
1277
1278                 isn = tcp_v4_init_sequence(skb);
1279         }
1280         tcp_rsk(req)->snt_isn = isn;
1281
1282         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1283                 goto drop_and_free;
1284
1285         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1286         return 0;
1287
1288 drop_and_release:
1289         dst_release(dst);
1290 drop_and_free:
1291         reqsk_free(req);
1292 drop:
1293         return 0;
1294 }
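/* Summary of the SYN handling above: a request_sock is allocated and the
 * options are parsed; if the SYN queue is full and syncookies are enabled
 * the ISN is encoded as a cookie instead of queueing state, otherwise the
 * peer may be vetted against the timestamp cache (tw_recycle) and a normal
 * secure ISN is chosen; finally a SYN-ACK is sent and, for non-cookie
 * requests, the request is hashed into the SYN queue with TCP_TIMEOUT_INIT.
 */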
1295
1296
1297 /*
1298  * The three way handshake has completed - we got a valid ACK -
1299  * now create the new socket.
1300  */
1301 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1302                                   struct request_sock *req,
1303                                   struct dst_entry *dst)
1304 {
1305         struct inet_request_sock *ireq;
1306         struct inet_sock *newinet;
1307         struct tcp_sock *newtp;
1308         struct sock *newsk;
1309 #ifdef CONFIG_TCP_MD5SIG
1310         struct tcp_md5sig_key *key;
1311 #endif
1312
1313         if (sk_acceptq_is_full(sk))
1314                 goto exit_overflow;
1315
1316         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1317                 goto exit;
1318
1319         newsk = tcp_create_openreq_child(sk, req, skb);
1320         if (!newsk)
1321                 goto exit;
1322
1323         newsk->sk_gso_type = SKB_GSO_TCPV4;
1324         sk_setup_caps(newsk, dst);
1325
1326         newtp                 = tcp_sk(newsk);
1327         newinet               = inet_sk(newsk);
1328         ireq                  = inet_rsk(req);
1329         newinet->daddr        = ireq->rmt_addr;
1330         newinet->rcv_saddr    = ireq->loc_addr;
1331         newinet->saddr        = ireq->loc_addr;
1332         newinet->opt          = ireq->opt;
1333         ireq->opt             = NULL;
1334         newinet->mc_index     = inet_iif(skb);
1335         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1336         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1337         if (newinet->opt)
1338                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1339         newinet->id = newtp->write_seq ^ jiffies;
1340
1341         tcp_mtup_init(newsk);
1342         tcp_sync_mss(newsk, dst_mtu(dst));
1343         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1344         tcp_initialize_rcv_mss(newsk);
1345
1346 #ifdef CONFIG_TCP_MD5SIG
1347         /* Copy over the MD5 key from the original socket */
1348         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1349                 /*
1350                  * We're using one, so create a matching key
1351                  * on the newsk structure. If we fail to get
1352                  * memory, then we end up not copying the key
1353                  * across. Shucks.
1354                  */
1355                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1356                 if (newkey != NULL)
1357                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1358                                           newkey, key->keylen);
1359         }
1360 #endif
1361
1362         __inet_hash_nolisten(newsk);
1363         __inet_inherit_port(sk, newsk);
1364
1365         return newsk;
1366
1367 exit_overflow:
1368         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1369 exit:
1370         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1371         dst_release(dst);
1372         return NULL;
1373 }
1374
1375 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1376 {
1377         struct tcphdr *th = tcp_hdr(skb);
1378         const struct iphdr *iph = ip_hdr(skb);
1379         struct sock *nsk;
1380         struct request_sock **prev;
1381         /* Find possible connection requests. */
1382         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1383                                                        iph->saddr, iph->daddr);
1384         if (req)
1385                 return tcp_check_req(sk, skb, req, prev);
1386
1387         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1388                         th->source, iph->daddr, th->dest, inet_iif(skb));
1389
1390         if (nsk) {
1391                 if (nsk->sk_state != TCP_TIME_WAIT) {
1392                         bh_lock_sock(nsk);
1393                         return nsk;
1394                 }
1395                 inet_twsk_put(inet_twsk(nsk));
1396                 return NULL;
1397         }
1398
1399 #ifdef CONFIG_SYN_COOKIES
1400         if (!th->rst && !th->syn && th->ack)
1401                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1402 #endif
1403         return sk;
1404 }
1405
1406 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1407 {
1408         const struct iphdr *iph = ip_hdr(skb);
1409
1410         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1411                 if (!tcp_v4_check(skb->len, iph->saddr,
1412                                   iph->daddr, skb->csum)) {
1413                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1414                         return 0;
1415                 }
1416         }
1417
1418         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1419                                        skb->len, IPPROTO_TCP, 0);
1420
1421         if (skb->len <= 76) {
1422                 return __skb_checksum_complete(skb);
1423         }
1424         return 0;
1425 }
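/* Checksum policy above: a CHECKSUM_COMPLETE skb is verified immediately;
 * otherwise only the pseudo-header sum is seeded and, for packets of at
 * most 76 bytes, the full software checksum is done right away since it is
 * cheap, while larger packets are left for copy-and-checksum later in the
 * receive path.
 */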
1426
1427
1428 /* The socket must have its spinlock held when we get
1429  * here.
1430  *
1431  * We have a potential double-lock case here, so even when
1432  * doing backlog processing we use the BH locking scheme.
1433  * This is because we cannot sleep with the original spinlock
1434  * held.
1435  */
1436 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1437 {
1438         struct sock *rsk;
1439 #ifdef CONFIG_TCP_MD5SIG
1440         /*
1441          * We really want to reject the packet as early as possible
1442          * if:
1443          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1444          *  o There is an MD5 option and we're not expecting one
1445          */
1446         if (tcp_v4_inbound_md5_hash(sk, skb))
1447                 goto discard;
1448 #endif
1449
1450         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1451                 TCP_CHECK_TIMER(sk);
1452                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1453                         rsk = sk;
1454                         goto reset;
1455                 }
1456                 TCP_CHECK_TIMER(sk);
1457                 return 0;
1458         }
1459
1460         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1461                 goto csum_err;
1462
1463         if (sk->sk_state == TCP_LISTEN) {
1464                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1465                 if (!nsk)
1466                         goto discard;
1467
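                     /* nsk != sk means the handshake completed and a child
                      * socket was created; let tcp_child_process() handle the
                      * segment in the child's context.
                      */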
1468                 if (nsk != sk) {
1469                         if (tcp_child_process(sk, nsk, skb)) {
1470                                 rsk = nsk;
1471                                 goto reset;
1472                         }
1473                         return 0;
1474                 }
1475         }
1476
1477         TCP_CHECK_TIMER(sk);
1478         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1479                 rsk = sk;
1480                 goto reset;
1481         }
1482         TCP_CHECK_TIMER(sk);
1483         return 0;
1484
1485 reset:
1486         tcp_v4_send_reset(rsk, skb);
1487 discard:
1488         kfree_skb(skb);
1489         /* Be careful here. If this function gets more complicated and
1490          * gcc suffers from register pressure on the x86, sk (in %ebx)
1491          * might be destroyed here. This current version compiles correctly,
1492          * but you have been warned.
1493          */
1494         return 0;
1495
1496 csum_err:
1497         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1498         goto discard;
1499 }
1500
1501 /*
1502  *      From tcp_input.c
1503  */
1504
1505 int tcp_v4_rcv(struct sk_buff *skb)
1506 {
1507         const struct iphdr *iph;
1508         struct tcphdr *th;
1509         struct sock *sk;
1510         int ret;
1511
1512         if (skb->pkt_type != PACKET_HOST)
1513                 goto discard_it;
1514
1515         /* Count it even if it's bad */
1516         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1517
1518         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1519                 goto discard_it;
1520
1521         th = tcp_hdr(skb);
1522
1523         if (th->doff < sizeof(struct tcphdr) / 4)
1524                 goto bad_packet;
1525         if (!pskb_may_pull(skb, th->doff * 4))
1526                 goto discard_it;
1527
1528         /* An explanation is required here, I think.
1529          * Packet length and doff are validated by header prediction,
1530          * provided the case of th->doff == 0 is eliminated.
1531          * So, we defer the checks. */
1532         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1533                 goto bad_packet;
1534
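             /* Fill in the control block.  SYN and FIN each consume one unit
              * of sequence space, which is why they are added to the payload
              * length when computing end_seq.
              */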
1535         th = tcp_hdr(skb);
1536         iph = ip_hdr(skb);
1537         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1538         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1539                                     skb->len - th->doff * 4);
1540         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1541         TCP_SKB_CB(skb)->when    = 0;
1542         TCP_SKB_CB(skb)->flags   = iph->tos;
1543         TCP_SKB_CB(skb)->sacked  = 0;
1544
1545         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1546                         th->source, iph->daddr, th->dest, inet_iif(skb));
1547         if (!sk)
1548                 goto no_tcp_socket;
1549
1550 process:
1551         if (sk->sk_state == TCP_TIME_WAIT)
1552                 goto do_time_wait;
1553
1554         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1555                 goto discard_and_relse;
1556         nf_reset(skb);
1557
1558         if (sk_filter(sk, skb))
1559                 goto discard_and_relse;
1560
1561         skb->dev = NULL;
1562
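             /* If no user context owns the socket we can process (or
              * prequeue) the segment in softirq context now; otherwise it
              * goes on the backlog and is handled when the socket lock is
              * released.
              */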
1563         bh_lock_sock_nested(sk);
1564         ret = 0;
1565         if (!sock_owned_by_user(sk)) {
1566 #ifdef CONFIG_NET_DMA
1567                 struct tcp_sock *tp = tcp_sk(sk);
1568                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1569                         tp->ucopy.dma_chan = get_softnet_dma();
1570                 if (tp->ucopy.dma_chan)
1571                         ret = tcp_v4_do_rcv(sk, skb);
1572                 else
1573 #endif
1574                 {
1575                         if (!tcp_prequeue(sk, skb))
1576                                 ret = tcp_v4_do_rcv(sk, skb);
1577                 }
1578         } else
1579                 sk_add_backlog(sk, skb);
1580         bh_unlock_sock(sk);
1581
1582         sock_put(sk);
1583
1584         return ret;
1585
1586 no_tcp_socket:
1587         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1588                 goto discard_it;
1589
1590         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1591 bad_packet:
1592                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1593         } else {
1594                 tcp_v4_send_reset(NULL, skb);
1595         }
1596
1597 discard_it:
1598         /* Discard frame. */
1599         kfree_skb(skb);
1600         return 0;
1601
1602 discard_and_relse:
1603         sock_put(sk);
1604         goto discard_it;
1605
1606 do_time_wait:
1607         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1608                 inet_twsk_put(inet_twsk(sk));
1609                 goto discard_it;
1610         }
1611
1612         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1613                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1614                 inet_twsk_put(inet_twsk(sk));
1615                 goto discard_it;
1616         }
1617         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1618         case TCP_TW_SYN: {
1619                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1620                                                         &tcp_hashinfo,
1621                                                         iph->daddr, th->dest,
1622                                                         inet_iif(skb));
1623                 if (sk2) {
1624                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1625                         inet_twsk_put(inet_twsk(sk));
1626                         sk = sk2;
1627                         goto process;
1628                 }
1629                 /* Fall through to ACK */
1630         }
1631         case TCP_TW_ACK:
1632                 tcp_v4_timewait_ack(sk, skb);
1633                 break;
1634         case TCP_TW_RST:
1635                 goto no_tcp_socket;
1636         case TCP_TW_SUCCESS:;
1637         }
1638         goto discard_it;
1639 }
1640
1641 /* VJ's idea. Save the last timestamp seen from this destination
1642  * and hold it for at least the normal timewait interval, to use for
1643  * duplicate segment detection in subsequent connections, before they
1644  * enter the synchronized state.
1645  */
1646
1647 int tcp_v4_remember_stamp(struct sock *sk)
1648 {
1649         struct inet_sock *inet = inet_sk(sk);
1650         struct tcp_sock *tp = tcp_sk(sk);
1651         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1652         struct inet_peer *peer = NULL;
1653         int release_it = 0;
1654
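             /* Reuse the peer bound to the cached route when it still points
              * at our destination; otherwise take a temporary reference that
              * must be dropped (release_it) once we are done with it.
              */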
1655         if (!rt || rt->rt_dst != inet->daddr) {
1656                 peer = inet_getpeer(inet->daddr, 1);
1657                 release_it = 1;
1658         } else {
1659                 if (!rt->peer)
1660                         rt_bind_peer(rt, 1);
1661                 peer = rt->peer;
1662         }
1663
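             /* Refresh the cached timestamp only if ours is at least as
              * recent, or if the stored entry has aged beyond TCP_PAWS_MSL
              * and is no newer than what this connection saw.
              */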
1664         if (peer) {
1665                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1666                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1667                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1668                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1669                         peer->tcp_ts = tp->rx_opt.ts_recent;
1670                 }
1671                 if (release_it)
1672                         inet_putpeer(peer);
1673                 return 1;
1674         }
1675
1676         return 0;
1677 }
1678
1679 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1680 {
1681         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1682
1683         if (peer) {
1684                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1685
1686                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1687                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1688                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1689                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1690                         peer->tcp_ts       = tcptw->tw_ts_recent;
1691                 }
1692                 inet_putpeer(peer);
1693                 return 1;
1694         }
1695
1696         return 0;
1697 }
1698
1699 struct inet_connection_sock_af_ops ipv4_specific = {
1700         .queue_xmit        = ip_queue_xmit,
1701         .send_check        = tcp_v4_send_check,
1702         .rebuild_header    = inet_sk_rebuild_header,
1703         .conn_request      = tcp_v4_conn_request,
1704         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1705         .remember_stamp    = tcp_v4_remember_stamp,
1706         .net_header_len    = sizeof(struct iphdr),
1707         .setsockopt        = ip_setsockopt,
1708         .getsockopt        = ip_getsockopt,
1709         .addr2sockaddr     = inet_csk_addr2sockaddr,
1710         .sockaddr_len      = sizeof(struct sockaddr_in),
1711         .bind_conflict     = inet_csk_bind_conflict,
1712 #ifdef CONFIG_COMPAT
1713         .compat_setsockopt = compat_ip_setsockopt,
1714         .compat_getsockopt = compat_ip_getsockopt,
1715 #endif
1716 };
1717
1718 #ifdef CONFIG_TCP_MD5SIG
1719 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1720         .md5_lookup             = tcp_v4_md5_lookup,
1721         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1722         .md5_add                = tcp_v4_md5_add_func,
1723         .md5_parse              = tcp_v4_parse_md5_keys,
1724 };
1725 #endif
1726
1727 /* NOTE: A lot of things are set to zero explicitly by the call to
1728  *       sk_alloc(), so they need not be done here.
1729  */
1730 static int tcp_v4_init_sock(struct sock *sk)
1731 {
1732         struct inet_connection_sock *icsk = inet_csk(sk);
1733         struct tcp_sock *tp = tcp_sk(sk);
1734
1735         skb_queue_head_init(&tp->out_of_order_queue);
1736         tcp_init_xmit_timers(sk);
1737         tcp_prequeue_init(tp);
1738
1739         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1740         tp->mdev = TCP_TIMEOUT_INIT;
1741
1742         /* So many TCP implementations out there (incorrectly) count the
1743          * initial SYN frame in their delayed-ACK and congestion control
1744          * algorithms that we must have the following bandaid to talk
1745          * efficiently to them.  -DaveM
1746          */
1747         tp->snd_cwnd = 2;
1748
1749         /* See draft-stevens-tcpca-spec-01 for discussion of the
1750          * initialization of these values.
1751          */
1752         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1753         tp->snd_cwnd_clamp = ~0;
1754         tp->mss_cache = 536;
1755
1756         tp->reordering = sysctl_tcp_reordering;
1757         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1758
1759         sk->sk_state = TCP_CLOSE;
1760
1761         sk->sk_write_space = sk_stream_write_space;
1762         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1763
1764         icsk->icsk_af_ops = &ipv4_specific;
1765         icsk->icsk_sync_mss = tcp_sync_mss;
1766 #ifdef CONFIG_TCP_MD5SIG
1767         tp->af_specific = &tcp_sock_ipv4_specific;
1768 #endif
1769
1770         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1771         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1772
1773         atomic_inc(&tcp_sockets_allocated);
1774
1775         return 0;
1776 }
1777
1778 int tcp_v4_destroy_sock(struct sock *sk)
1779 {
1780         struct tcp_sock *tp = tcp_sk(sk);
1781
1782         tcp_clear_xmit_timers(sk);
1783
1784         tcp_cleanup_congestion_control(sk);
1785
1786         /* Clean up the write buffer. */
1787         tcp_write_queue_purge(sk);
1788
1789         /* Clean up our (hopefully empty) out_of_order_queue. */
1790         __skb_queue_purge(&tp->out_of_order_queue);
1791
1792 #ifdef CONFIG_TCP_MD5SIG
1793         /* Clean up the MD5 key list, if any */
1794         if (tp->md5sig_info) {
1795                 tcp_v4_clear_md5_list(sk);
1796                 kfree(tp->md5sig_info);
1797                 tp->md5sig_info = NULL;
1798         }
1799 #endif
1800
1801 #ifdef CONFIG_NET_DMA
1802         /* Cleans up our sk_async_wait_queue */
1803         __skb_queue_purge(&sk->sk_async_wait_queue);
1804 #endif
1805
1806         /* Clean up the prequeue; it really should be empty. */
1807         __skb_queue_purge(&tp->ucopy.prequeue);
1808
1809         /* Clean up a referenced TCP bind bucket. */
1810         if (inet_csk(sk)->icsk_bind_hash)
1811                 inet_put_port(sk);
1812
1813         /*
1814          * If sendmsg cached page exists, toss it.
1815          */
1816         if (sk->sk_sndmsg_page) {
1817                 __free_page(sk->sk_sndmsg_page);
1818                 sk->sk_sndmsg_page = NULL;
1819         }
1820
1821         atomic_dec(&tcp_sockets_allocated);
1822
1823         return 0;
1824 }
1825
1826 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1827
1828 #ifdef CONFIG_PROC_FS
1829 /* Proc filesystem TCP sock list dumping. */
1830
1831 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1832 {
1833         return hlist_empty(head) ? NULL :
1834                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1835 }
1836
1837 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1838 {
1839         return tw->tw_node.next ?
1840                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1841 }
1842
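     /* Walk the listening hash.  For every listener of the requested family
      * and namespace we also walk its SYN table (open requests); st->state
      * records whether the cursor is on a listener or inside its syn_table.
      */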
1843 static void *listening_get_next(struct seq_file *seq, void *cur)
1844 {
1845         struct inet_connection_sock *icsk;
1846         struct hlist_node *node;
1847         struct sock *sk = cur;
1848         struct tcp_iter_state* st = seq->private;
1849         struct net *net = seq_file_net(seq);
1850
1851         if (!sk) {
1852                 st->bucket = 0;
1853                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1854                 goto get_sk;
1855         }
1856
1857         ++st->num;
1858
1859         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1860                 struct request_sock *req = cur;
1861
1862                 icsk = inet_csk(st->syn_wait_sk);
1863                 req = req->dl_next;
1864                 while (1) {
1865                         while (req) {
1866                                 if (req->rsk_ops->family == st->family &&
1867                                     net_eq(sock_net(req->sk), net)) {
1868                                         cur = req;
1869                                         goto out;
1870                                 }
1871                                 req = req->dl_next;
1872                         }
1873                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1874                                 break;
1875 get_req:
1876                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1877                 }
1878                 sk        = sk_next(st->syn_wait_sk);
1879                 st->state = TCP_SEQ_STATE_LISTENING;
1880                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1881         } else {
1882                 icsk = inet_csk(sk);
1883                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1884                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1885                         goto start_req;
1886                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1887                 sk = sk_next(sk);
1888         }
1889 get_sk:
1890         sk_for_each_from(sk, node) {
1891                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1892                         cur = sk;
1893                         goto out;
1894                 }
1895                 icsk = inet_csk(sk);
1896                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1897                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1898 start_req:
1899                         st->uid         = sock_i_uid(sk);
1900                         st->syn_wait_sk = sk;
1901                         st->state       = TCP_SEQ_STATE_OPENREQ;
1902                         st->sbucket     = 0;
1903                         goto get_req;
1904                 }
1905                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1906         }
1907         if (++st->bucket < INET_LHTABLE_SIZE) {
1908                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1909                 goto get_sk;
1910         }
1911         cur = NULL;
1912 out:
1913         return cur;
1914 }
1915
1916 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1917 {
1918         void *rc = listening_get_next(seq, NULL);
1919
1920         while (rc && *pos) {
1921                 rc = listening_get_next(seq, rc);
1922                 --*pos;
1923         }
1924         return rc;
1925 }
1926
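     /* Scan the established hash.  Within each bucket, full sockets are
      * reported first and then the TIME_WAIT chain, under the per-bucket lock.
      */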
1927 static void *established_get_first(struct seq_file *seq)
1928 {
1929         struct tcp_iter_state* st = seq->private;
1930         struct net *net = seq_file_net(seq);
1931         void *rc = NULL;
1932
1933         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1934                 struct sock *sk;
1935                 struct hlist_node *node;
1936                 struct inet_timewait_sock *tw;
1937                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1938
1939                 read_lock_bh(lock);
1940                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1941                         if (sk->sk_family != st->family ||
1942                             !net_eq(sock_net(sk), net)) {
1943                                 continue;
1944                         }
1945                         rc = sk;
1946                         goto out;
1947                 }
1948                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1949                 inet_twsk_for_each(tw, node,
1950                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1951                         if (tw->tw_family != st->family ||
1952                             !net_eq(twsk_net(tw), net)) {
1953                                 continue;
1954                         }
1955                         rc = tw;
1956                         goto out;
1957                 }
1958                 read_unlock_bh(lock);
1959                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1960         }
1961 out:
1962         return rc;
1963 }
1964
1965 static void *established_get_next(struct seq_file *seq, void *cur)
1966 {
1967         struct sock *sk = cur;
1968         struct inet_timewait_sock *tw;
1969         struct hlist_node *node;
1970         struct tcp_iter_state* st = seq->private;
1971         struct net *net = seq_file_net(seq);
1972
1973         ++st->num;
1974
1975         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1976                 tw = cur;
1977                 tw = tw_next(tw);
1978 get_tw:
1979                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1980                         tw = tw_next(tw);
1981                 }
1982                 if (tw) {
1983                         cur = tw;
1984                         goto out;
1985                 }
1986                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1987                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1988
1989                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1990                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1991                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1992                 } else {
1993                         cur = NULL;
1994                         goto out;
1995                 }
1996         } else
1997                 sk = sk_next(sk);
1998
1999         sk_for_each_from(sk, node) {
2000                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2001                         goto found;
2002         }
2003
2004         st->state = TCP_SEQ_STATE_TIME_WAIT;
2005         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2006         goto get_tw;
2007 found:
2008         cur = sk;
2009 out:
2010         return cur;
2011 }
2012
2013 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2014 {
2015         void *rc = established_get_first(seq);
2016
2017         while (rc && pos) {
2018                 rc = established_get_next(seq, rc);
2019                 --pos;
2020         }
2021         return rc;
2022 }
2023
2024 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2025 {
2026         void *rc;
2027         struct tcp_iter_state* st = seq->private;
2028
2029         inet_listen_lock(&tcp_hashinfo);
2030         st->state = TCP_SEQ_STATE_LISTENING;
2031         rc        = listening_get_idx(seq, &pos);
2032
2033         if (!rc) {
2034                 inet_listen_unlock(&tcp_hashinfo);
2035                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2036                 rc        = established_get_idx(seq, pos);
2037         }
2038
2039         return rc;
2040 }
2041
2042 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2043 {
2044         struct tcp_iter_state* st = seq->private;
2045         st->state = TCP_SEQ_STATE_LISTENING;
2046         st->num = 0;
2047         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2048 }
2049
2050 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2051 {
2052         void *rc = NULL;
2053         struct tcp_iter_state* st;
2054
2055         if (v == SEQ_START_TOKEN) {
2056                 rc = tcp_get_idx(seq, 0);
2057                 goto out;
2058         }
2059         st = seq->private;
2060
2061         switch (st->state) {
2062         case TCP_SEQ_STATE_OPENREQ:
2063         case TCP_SEQ_STATE_LISTENING:
2064                 rc = listening_get_next(seq, v);
2065                 if (!rc) {
2066                         inet_listen_unlock(&tcp_hashinfo);
2067                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2068                         rc        = established_get_first(seq);
2069                 }
2070                 break;
2071         case TCP_SEQ_STATE_ESTABLISHED:
2072         case TCP_SEQ_STATE_TIME_WAIT:
2073                 rc = established_get_next(seq, v);
2074                 break;
2075         }
2076 out:
2077         ++*pos;
2078         return rc;
2079 }
2080
2081 static void tcp_seq_stop(struct seq_file *seq, void *v)
2082 {
2083         struct tcp_iter_state* st = seq->private;
2084
2085         switch (st->state) {
2086         case TCP_SEQ_STATE_OPENREQ:
2087                 if (v) {
2088                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2089                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2090                 }
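                     /* Fall through: the listening hash lock taken in
                      * tcp_get_idx() is still held and must be released below.
                      */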
2091         case TCP_SEQ_STATE_LISTENING:
2092                 if (v != SEQ_START_TOKEN)
2093                         inet_listen_unlock(&tcp_hashinfo);
2094                 break;
2095         case TCP_SEQ_STATE_TIME_WAIT:
2096         case TCP_SEQ_STATE_ESTABLISHED:
2097                 if (v)
2098                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2099                 break;
2100         }
2101 }
2102
2103 static int tcp_seq_open(struct inode *inode, struct file *file)
2104 {
2105         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2106         struct tcp_iter_state *s;
2107         int err;
2108
2109         err = seq_open_net(inode, file, &afinfo->seq_ops,
2110                           sizeof(struct tcp_iter_state));
2111         if (err < 0)
2112                 return err;
2113
2114         s = ((struct seq_file *)file->private_data)->private;
2115         s->family               = afinfo->family;
2116         return 0;
2117 }
2118
2119 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2120 {
2121         int rc = 0;
2122         struct proc_dir_entry *p;
2123
2124         afinfo->seq_fops.open           = tcp_seq_open;
2125         afinfo->seq_fops.read           = seq_read;
2126         afinfo->seq_fops.llseek         = seq_lseek;
2127         afinfo->seq_fops.release        = seq_release_net;
2128
2129         afinfo->seq_ops.start           = tcp_seq_start;
2130         afinfo->seq_ops.next            = tcp_seq_next;
2131         afinfo->seq_ops.stop            = tcp_seq_stop;
2132
2133         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2134                              &afinfo->seq_fops, afinfo);
2135         if (!p)
2136                 rc = -ENOMEM;
2137         return rc;
2138 }
2139
2140 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2141 {
2142         proc_net_remove(net, afinfo->name);
2143 }
2144
2145 static void get_openreq4(struct sock *sk, struct request_sock *req,
2146                          struct seq_file *f, int i, int uid, int *len)
2147 {
2148         const struct inet_request_sock *ireq = inet_rsk(req);
2149         int ttd = req->expires - jiffies;
2150
2151         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2152                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2153                 i,
2154                 ireq->loc_addr,
2155                 ntohs(inet_sk(sk)->sport),
2156                 ireq->rmt_addr,
2157                 ntohs(ireq->rmt_port),
2158                 TCP_SYN_RECV,
2159                 0, 0, /* could print option size, but that is af dependent. */
2160                 1,    /* timers active (only the expire timer) */
2161                 jiffies_to_clock_t(ttd),
2162                 req->retrans,
2163                 uid,
2164                 0,  /* non standard timer */
2165                 0, /* open_requests have no inode */
2166                 atomic_read(&sk->sk_refcnt),
2167                 req,
2168                 len);
2169 }
2170
2171 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2172 {
2173         int timer_active;
2174         unsigned long timer_expires;
2175         struct tcp_sock *tp = tcp_sk(sk);
2176         const struct inet_connection_sock *icsk = inet_csk(sk);
2177         struct inet_sock *inet = inet_sk(sk);
2178         __be32 dest = inet->daddr;
2179         __be32 src = inet->rcv_saddr;
2180         __u16 destp = ntohs(inet->dport);
2181         __u16 srcp = ntohs(inet->sport);
2182
2183         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2184                 timer_active    = 1;
2185                 timer_expires   = icsk->icsk_timeout;
2186         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2187                 timer_active    = 4;
2188                 timer_expires   = icsk->icsk_timeout;
2189         } else if (timer_pending(&sk->sk_timer)) {
2190                 timer_active    = 2;
2191                 timer_expires   = sk->sk_timer.expires;
2192         } else {
2193                 timer_active    = 0;
2194                 timer_expires = jiffies;
2195         }
2196
2197         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2198                         "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
2199                 i, src, srcp, dest, destp, sk->sk_state,
2200                 tp->write_seq - tp->snd_una,
2201                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2202                                              (tp->rcv_nxt - tp->copied_seq),
2203                 timer_active,
2204                 jiffies_to_clock_t(timer_expires - jiffies),
2205                 icsk->icsk_retransmits,
2206                 sock_i_uid(sk),
2207                 icsk->icsk_probes_out,
2208                 sock_i_ino(sk),
2209                 atomic_read(&sk->sk_refcnt), sk,
2210                 icsk->icsk_rto,
2211                 icsk->icsk_ack.ato,
2212                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2213                 tp->snd_cwnd,
2214                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2215                 len);
2216 }
2217
2218 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2219                                struct seq_file *f, int i, int *len)
2220 {
2221         __be32 dest, src;
2222         __u16 destp, srcp;
2223         int ttd = tw->tw_ttd - jiffies;
2224
2225         if (ttd < 0)
2226                 ttd = 0;
2227
2228         dest  = tw->tw_daddr;
2229         src   = tw->tw_rcv_saddr;
2230         destp = ntohs(tw->tw_dport);
2231         srcp  = ntohs(tw->tw_sport);
2232
2233         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2234                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2235                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2236                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2237                 atomic_read(&tw->tw_refcnt), tw, len);
2238 }
2239
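     /* Each record written to /proc/net/tcp is padded with spaces to
      * TMPSZ - 1 characters, so that every line has a fixed width.
      */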
2240 #define TMPSZ 150
2241
2242 static int tcp4_seq_show(struct seq_file *seq, void *v)
2243 {
2244         struct tcp_iter_state* st;
2245         int len;
2246
2247         if (v == SEQ_START_TOKEN) {
2248                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2249                            "  sl  local_address rem_address   st tx_queue "
2250                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2251                            "inode");
2252                 goto out;
2253         }
2254         st = seq->private;
2255
2256         switch (st->state) {
2257         case TCP_SEQ_STATE_LISTENING:
2258         case TCP_SEQ_STATE_ESTABLISHED:
2259                 get_tcp4_sock(v, seq, st->num, &len);
2260                 break;
2261         case TCP_SEQ_STATE_OPENREQ:
2262                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2263                 break;
2264         case TCP_SEQ_STATE_TIME_WAIT:
2265                 get_timewait4_sock(v, seq, st->num, &len);
2266                 break;
2267         }
2268         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2269 out:
2270         return 0;
2271 }
2272
2273 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2274         .name           = "tcp",
2275         .family         = AF_INET,
2276         .seq_fops       = {
2277                 .owner          = THIS_MODULE,
2278         },
2279         .seq_ops        = {
2280                 .show           = tcp4_seq_show,
2281         },
2282 };
2283
2284 static int tcp4_proc_init_net(struct net *net)
2285 {
2286         return tcp_proc_register(net, &tcp4_seq_afinfo);
2287 }
2288
2289 static void tcp4_proc_exit_net(struct net *net)
2290 {
2291         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2292 }
2293
2294 static struct pernet_operations tcp4_net_ops = {
2295         .init = tcp4_proc_init_net,
2296         .exit = tcp4_proc_exit_net,
2297 };
2298
2299 int __init tcp4_proc_init(void)
2300 {
2301         return register_pernet_subsys(&tcp4_net_ops);
2302 }
2303
2304 void tcp4_proc_exit(void)
2305 {
2306         unregister_pernet_subsys(&tcp4_net_ops);
2307 }
2308 #endif /* CONFIG_PROC_FS */
2309
2310 struct proto tcp_prot = {
2311         .name                   = "TCP",
2312         .owner                  = THIS_MODULE,
2313         .close                  = tcp_close,
2314         .connect                = tcp_v4_connect,
2315         .disconnect             = tcp_disconnect,
2316         .accept                 = inet_csk_accept,
2317         .ioctl                  = tcp_ioctl,
2318         .init                   = tcp_v4_init_sock,
2319         .destroy                = tcp_v4_destroy_sock,
2320         .shutdown               = tcp_shutdown,
2321         .setsockopt             = tcp_setsockopt,
2322         .getsockopt             = tcp_getsockopt,
2323         .recvmsg                = tcp_recvmsg,
2324         .backlog_rcv            = tcp_v4_do_rcv,
2325         .hash                   = inet_hash,
2326         .unhash                 = inet_unhash,
2327         .get_port               = inet_csk_get_port,
2328         .enter_memory_pressure  = tcp_enter_memory_pressure,
2329         .sockets_allocated      = &tcp_sockets_allocated,
2330         .orphan_count           = &tcp_orphan_count,
2331         .memory_allocated       = &tcp_memory_allocated,
2332         .memory_pressure        = &tcp_memory_pressure,
2333         .sysctl_mem             = sysctl_tcp_mem,
2334         .sysctl_wmem            = sysctl_tcp_wmem,
2335         .sysctl_rmem            = sysctl_tcp_rmem,
2336         .max_header             = MAX_TCP_HEADER,
2337         .obj_size               = sizeof(struct tcp_sock),
2338         .twsk_prot              = &tcp_timewait_sock_ops,
2339         .rsk_prot               = &tcp_request_sock_ops,
2340         .h.hashinfo             = &tcp_hashinfo,
2341 #ifdef CONFIG_COMPAT
2342         .compat_setsockopt      = compat_tcp_setsockopt,
2343         .compat_getsockopt      = compat_tcp_getsockopt,
2344 #endif
2345 };
2346
2347
2348 static int __net_init tcp_sk_init(struct net *net)
2349 {
2350         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2351                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2352 }
2353
2354 static void __net_exit tcp_sk_exit(struct net *net)
2355 {
2356         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2357 }
2358
2359 static struct pernet_operations __net_initdata tcp_sk_ops = {
2360         .init = tcp_sk_init,
2361         .exit = tcp_sk_exit,
2362 };
2363
2364 void __init tcp_v4_init(void)
2365 {
2366         if (register_pernet_device(&tcp_sk_ops))
2367                 panic("Failed to create the TCP control socket.\n");
2368 }
2369
2370 EXPORT_SYMBOL(ipv4_specific);
2371 EXPORT_SYMBOL(tcp_hashinfo);
2372 EXPORT_SYMBOL(tcp_prot);
2373 EXPORT_SYMBOL(tcp_v4_conn_request);
2374 EXPORT_SYMBOL(tcp_v4_connect);
2375 EXPORT_SYMBOL(tcp_v4_do_rcv);
2376 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2377 EXPORT_SYMBOL(tcp_v4_send_check);
2378 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2379
2380 #ifdef CONFIG_PROC_FS
2381 EXPORT_SYMBOL(tcp_proc_register);
2382 EXPORT_SYMBOL(tcp_proc_unregister);
2383 #endif
2384 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2385