net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Added support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
45  *                                      year-long coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      to a single port at the same time.
51  */
52
53
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82
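/* Both knobs are exposed as sysctls under /proc/sys/net/ipv4/
 * (tcp_tw_reuse and tcp_low_latency).  tcp_tw_reuse gates the
 * TIME-WAIT recycling decision made in tcp_twsk_unique() below.
 */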
83 int sysctl_tcp_tw_reuse __read_mostly;
84 int sysctl_tcp_low_latency __read_mostly;
85
86 /* Check TCP sequence numbers in ICMP packets. */
87 #define ICMP_MIN_LENGTH 8
88
89 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
90
91 #ifdef CONFIG_TCP_MD5SIG
92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
93                                                    __be32 addr);
94 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
95                                    __be32 saddr, __be32 daddr,
96                                    struct tcphdr *th, unsigned int tcplen);
97 #else
98 static inline
99 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
100 {
101         return NULL;
102 }
103 #endif
104
105 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
106         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
107         .lhash_users = ATOMIC_INIT(0),
108         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
109 };
110
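/* Pick the initial sequence number for a passively opened connection:
 * secure_tcp_sequence_number() hashes the connection 4-tuple together
 * with a secret (RFC 1948 style), here with the incoming SYN's
 * addresses and ports swapped to our point of view.
 */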
111 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
112 {
113         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
114                                           ip_hdr(skb)->saddr,
115                                           tcp_hdr(skb)->dest,
116                                           tcp_hdr(skb)->source);
117 }
118
119 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
120 {
121         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
122         struct tcp_sock *tp = tcp_sk(sk);
123
124         /* With PAWS, it is safe from the viewpoint
125            of data integrity. Even without PAWS it is safe provided sequence
126            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
127
128            Actually, the idea is close to VJ's one, only the timestamp cache is
129            held not per host, but per port pair, and the TW bucket is used as the
130            state holder.
131
132            If the TW bucket has already been destroyed we fall back to VJ's
133            scheme and use the initial timestamp retrieved from the peer table.
134          */
135         if (tcptw->tw_ts_recent_stamp &&
136             (twp == NULL || (sysctl_tcp_tw_reuse &&
137                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
138                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
139                 if (tp->write_seq == 0)
140                         tp->write_seq = 1;
141                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
142                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
143                 sock_hold(sktw);
144                 return 1;
145         }
146
147         return 0;
148 }
149
150 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
151
152 /* This will initiate an outgoing connection. */
153 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
154 {
155         struct inet_sock *inet = inet_sk(sk);
156         struct tcp_sock *tp = tcp_sk(sk);
157         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
158         struct rtable *rt;
159         __be32 daddr, nexthop;
160         int tmp;
161         int err;
162
163         if (addr_len < sizeof(struct sockaddr_in))
164                 return -EINVAL;
165
166         if (usin->sin_family != AF_INET)
167                 return -EAFNOSUPPORT;
168
169         nexthop = daddr = usin->sin_addr.s_addr;
170         if (inet->opt && inet->opt->srr) {
171                 if (!daddr)
172                         return -EINVAL;
173                 nexthop = inet->opt->faddr;
174         }
175
176         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
177                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
178                                IPPROTO_TCP,
179                                inet->sport, usin->sin_port, sk, 1);
180         if (tmp < 0) {
181                 if (tmp == -ENETUNREACH)
182                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
183                 return tmp;
184         }
185
186         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187                 ip_rt_put(rt);
188                 return -ENETUNREACH;
189         }
190
191         if (!inet->opt || !inet->opt->srr)
192                 daddr = rt->rt_dst;
193
194         if (!inet->saddr)
195                 inet->saddr = rt->rt_src;
196         inet->rcv_saddr = inet->saddr;
197
198         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
199                 /* Reset inherited state */
200                 tp->rx_opt.ts_recent       = 0;
201                 tp->rx_opt.ts_recent_stamp = 0;
202                 tp->write_seq              = 0;
203         }
204
205         if (tcp_death_row.sysctl_tw_recycle &&
206             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
207                 struct inet_peer *peer = rt_get_peer(rt);
208                 /*
209                  * VJ's idea. We save the last timestamp seen from
210                  * the destination in the peer table when entering TIME-WAIT
211                  * state, and initialize rx_opt.ts_recent from it when trying
212                  * a new connection.
213                  */
214                 if (peer != NULL &&
215                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
216                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
217                         tp->rx_opt.ts_recent = peer->tcp_ts;
218                 }
219         }
220
221         inet->dport = usin->sin_port;
222         inet->daddr = daddr;
223
224         inet_csk(sk)->icsk_ext_hdr_len = 0;
225         if (inet->opt)
226                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
227
228         tp->rx_opt.mss_clamp = 536;
229
230         /* Socket identity is still unknown (sport may be zero).
231          * However we set the state to SYN-SENT and, without releasing the
232          * socket lock, select a source port, enter ourselves into the hash
233          * tables and complete initialization after this.
234          */
235         tcp_set_state(sk, TCP_SYN_SENT);
236         err = inet_hash_connect(&tcp_death_row, sk);
237         if (err)
238                 goto failure;
239
240         err = ip_route_newports(&rt, IPPROTO_TCP,
241                                 inet->sport, inet->dport, sk);
242         if (err)
243                 goto failure;
244
245         /* OK, now commit destination to socket.  */
246         sk->sk_gso_type = SKB_GSO_TCPV4;
247         sk_setup_caps(sk, &rt->u.dst);
248
249         if (!tp->write_seq)
250                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
251                                                            inet->daddr,
252                                                            inet->sport,
253                                                            usin->sin_port);
254
255         inet->id = tp->write_seq ^ jiffies;
256
257         err = tcp_connect(sk);
258         rt = NULL;
259         if (err)
260                 goto failure;
261
262         return 0;
263
264 failure:
265         /*
266          * This unhashes the socket and releases the local port,
267          * if necessary.
268          */
269         tcp_set_state(sk, TCP_CLOSE);
270         ip_rt_put(rt);
271         sk->sk_route_caps = 0;
272         inet->dport = 0;
273         return err;
274 }
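/*
 * Illustrative userspace sketch (not part of the original file): an
 * ordinary connect(2) on an AF_INET stream socket is what ultimately
 * reaches tcp_v4_connect() above.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *
 *	static int tcp_connect_to(const char *ip, unsigned short port)
 *	{
 *		struct sockaddr_in sin;
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&sin, 0, sizeof(sin));
 *		sin.sin_family = AF_INET;
 *		sin.sin_port = htons(port);
 *		if (inet_pton(AF_INET, ip, &sin.sin_addr) != 1 ||
 *		    connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}
 */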
275
276 /*
277  * This routine does path mtu discovery as defined in RFC1191.
278  */
279 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
280 {
281         struct dst_entry *dst;
282         struct inet_sock *inet = inet_sk(sk);
283
284         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
285          * sent out by Linux are always < 576 bytes, so they should go through
286          * unfragmented).
287          */
288         if (sk->sk_state == TCP_LISTEN)
289                 return;
290
291         /* We don't check in the dst entry if pmtu discovery is forbidden
292          * on this route. We just assume that no packet-too-big packets
293          * are sent back when pmtu discovery is not active.
294          * There is a small race when the user changes this flag in the
295          * route, but I think that's acceptable.
296          */
297         if ((dst = __sk_dst_check(sk, 0)) == NULL)
298                 return;
299
300         dst->ops->update_pmtu(dst, mtu);
301
302         /* Something is about to go wrong... Remember the soft error
303          * in case this connection is not able to recover.
304          */
305         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
306                 sk->sk_err_soft = EMSGSIZE;
307
308         mtu = dst_mtu(dst);
309
310         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
311             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
312                 tcp_sync_mss(sk, mtu);
313
314                 /* Resend the TCP packet because it's
315                  * clear that the old packet has been
316                  * dropped. This is the new "fast" path mtu
317                  * discovery.
318                  */
319                 tcp_simple_retransmit(sk);
320         } /* else let the usual retransmit timer handle it */
321 }
322
323 /*
324  * This routine is called by the ICMP module when it gets some
325  * sort of error condition.  If err < 0 then the socket should
326  * be closed and the error returned to the user.  If err > 0
327  * it's just the icmp type << 8 | icmp code.  After adjustment
328  * header points to the first 8 bytes of the tcp header.  We need
329  * to find the appropriate port.
330  *
331  * The locking strategy used here is very "optimistic". When
332  * someone else accesses the socket the ICMP is just dropped
333  * and for some paths there is no check at all.
334  * A more general error queue to queue errors for later handling
335  * is probably better.
336  *
337  */
338
339 void tcp_v4_err(struct sk_buff *skb, u32 info)
340 {
341         struct iphdr *iph = (struct iphdr *)skb->data;
342         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
343         struct tcp_sock *tp;
344         struct inet_sock *inet;
345         const int type = icmp_hdr(skb)->type;
346         const int code = icmp_hdr(skb)->code;
347         struct sock *sk;
348         __u32 seq;
349         int err;
350
351         if (skb->len < (iph->ihl << 2) + 8) {
352                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
353                 return;
354         }
355
356         sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
357                         iph->saddr, th->source, inet_iif(skb));
358         if (!sk) {
359                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
360                 return;
361         }
362         if (sk->sk_state == TCP_TIME_WAIT) {
363                 inet_twsk_put(inet_twsk(sk));
364                 return;
365         }
366
367         bh_lock_sock(sk);
368         /* If too many ICMPs get dropped on busy
369          * servers this needs to be solved differently.
370          */
371         if (sock_owned_by_user(sk))
372                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
373
374         if (sk->sk_state == TCP_CLOSE)
375                 goto out;
376
377         tp = tcp_sk(sk);
378         seq = ntohl(th->seq);
379         if (sk->sk_state != TCP_LISTEN &&
380             !between(seq, tp->snd_una, tp->snd_nxt)) {
381                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
382                 goto out;
383         }
384
385         switch (type) {
386         case ICMP_SOURCE_QUENCH:
387                 /* Just silently ignore these. */
388                 goto out;
389         case ICMP_PARAMETERPROB:
390                 err = EPROTO;
391                 break;
392         case ICMP_DEST_UNREACH:
393                 if (code > NR_ICMP_UNREACH)
394                         goto out;
395
396                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
397                         if (!sock_owned_by_user(sk))
398                                 do_pmtu_discovery(sk, iph, info);
399                         goto out;
400                 }
401
402                 err = icmp_err_convert[code].errno;
403                 break;
404         case ICMP_TIME_EXCEEDED:
405                 err = EHOSTUNREACH;
406                 break;
407         default:
408                 goto out;
409         }
410
411         switch (sk->sk_state) {
412                 struct request_sock *req, **prev;
413         case TCP_LISTEN:
414                 if (sock_owned_by_user(sk))
415                         goto out;
416
417                 req = inet_csk_search_req(sk, &prev, th->dest,
418                                           iph->daddr, iph->saddr);
419                 if (!req)
420                         goto out;
421
422                 /* ICMPs are not backlogged, hence we cannot get
423                    an established socket here.
424                  */
425                 BUG_TRAP(!req->sk);
426
427                 if (seq != tcp_rsk(req)->snt_isn) {
428                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
429                         goto out;
430                 }
431
432                 /*
433                  * Still in SYN_RECV, just remove it silently.
434                  * There is no good way to pass the error to the newly
435                  * created socket, and POSIX does not want network
436                  * errors returned from accept().
437                  */
438                 inet_csk_reqsk_queue_drop(sk, req, prev);
439                 goto out;
440
441         case TCP_SYN_SENT:
442         case TCP_SYN_RECV:  /* Cannot happen normally.
443                                It can, e.g., if SYNs crossed.
444                              */
445                 if (!sock_owned_by_user(sk)) {
446                         sk->sk_err = err;
447
448                         sk->sk_error_report(sk);
449
450                         tcp_done(sk);
451                 } else {
452                         sk->sk_err_soft = err;
453                 }
454                 goto out;
455         }
456
457         /* If we've already connected we will keep trying
458          * until we time out, or the user gives up.
459          *
460          * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
461          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
462          * but it is obsoleted by pmtu discovery).
463          *
464          * Note that in the modern internet, where routing is unreliable
465          * and broken firewalls sit in every dark corner sending random
466          * errors as ordered by their masters, even these two messages have
467          * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
468          *
469          * Now we are in compliance with the RFCs.
470          *                                                      --ANK (980905)
471          */
472
473         inet = inet_sk(sk);
474         if (!sock_owned_by_user(sk) && inet->recverr) {
475                 sk->sk_err = err;
476                 sk->sk_error_report(sk);
477         } else  { /* Only an error on timeout */
478                 sk->sk_err_soft = err;
479         }
480
481 out:
482         bh_unlock_sock(sk);
483         sock_put(sk);
484 }
485
486 /* This routine computes an IPv4 TCP checksum. */
487 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
488 {
489         struct inet_sock *inet = inet_sk(sk);
490         struct tcphdr *th = tcp_hdr(skb);
491
492         if (skb->ip_summed == CHECKSUM_PARTIAL) {
493                 th->check = ~tcp_v4_check(len, inet->saddr,
494                                           inet->daddr, 0);
495                 skb->csum_start = skb_transport_header(skb) - skb->head;
496                 skb->csum_offset = offsetof(struct tcphdr, check);
497         } else {
498                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
499                                          csum_partial((char *)th,
500                                                       th->doff << 2,
501                                                       skb->csum));
502         }
503 }
504
505 int tcp_v4_gso_send_check(struct sk_buff *skb)
506 {
507         const struct iphdr *iph;
508         struct tcphdr *th;
509
510         if (!pskb_may_pull(skb, sizeof(*th)))
511                 return -EINVAL;
512
513         iph = ip_hdr(skb);
514         th = tcp_hdr(skb);
515
516         th->check = 0;
517         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
518         skb->csum_start = skb_transport_header(skb) - skb->head;
519         skb->csum_offset = offsetof(struct tcphdr, check);
520         skb->ip_summed = CHECKSUM_PARTIAL;
521         return 0;
522 }
523
524 /*
525  *      This routine will send an RST to the other tcp.
526  *
527  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
528  *                    for the reset?
529  *      Answer: if a packet caused an RST, it is not for a socket
530  *              existing in our system; if it is matched to a socket,
531  *              it is just a duplicate segment or a bug in the other side's TCP.
532  *              So we build the reply based only on the parameters that
533  *              arrived with the segment.
534  *      Exception: precedence violation. We do not implement it in any case.
535  */
536
537 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
538 {
539         struct tcphdr *th = tcp_hdr(skb);
540         struct {
541                 struct tcphdr th;
542 #ifdef CONFIG_TCP_MD5SIG
543                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
544 #endif
545         } rep;
546         struct ip_reply_arg arg;
547 #ifdef CONFIG_TCP_MD5SIG
548         struct tcp_md5sig_key *key;
549 #endif
550
551         /* Never send a reset in response to a reset. */
552         if (th->rst)
553                 return;
554
555         if (skb->rtable->rt_type != RTN_LOCAL)
556                 return;
557
558         /* Swap the send and the receive. */
559         memset(&rep, 0, sizeof(rep));
560         rep.th.dest   = th->source;
561         rep.th.source = th->dest;
562         rep.th.doff   = sizeof(struct tcphdr) / 4;
563         rep.th.rst    = 1;
564
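        /* RFC 793 reset generation: if the offending segment carried an
         * ACK, the RST takes its sequence number from that ACK; otherwise
         * the RST itself ACKs everything the segment occupied.
         */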
565         if (th->ack) {
566                 rep.th.seq = th->ack_seq;
567         } else {
568                 rep.th.ack = 1;
569                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
570                                        skb->len - (th->doff << 2));
571         }
572
573         memset(&arg, 0, sizeof(arg));
574         arg.iov[0].iov_base = (unsigned char *)&rep;
575         arg.iov[0].iov_len  = sizeof(rep.th);
576
577 #ifdef CONFIG_TCP_MD5SIG
578         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
579         if (key) {
580                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
581                                    (TCPOPT_NOP << 16) |
582                                    (TCPOPT_MD5SIG << 8) |
583                                    TCPOLEN_MD5SIG);
584                 /* Update length and the length the header thinks exists */
585                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
586                 rep.th.doff = arg.iov[0].iov_len / 4;
587
588                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
589                                         key,
590                                         ip_hdr(skb)->daddr,
591                                         ip_hdr(skb)->saddr,
592                                         &rep.th, arg.iov[0].iov_len);
593         }
594 #endif
595         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
596                                       ip_hdr(skb)->saddr, /* XXX */
597                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
598         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
599
600         ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
601                       &arg, arg.iov[0].iov_len);
602
603         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
604         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
605 }
606
607 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
608    outside of socket context, is certainly ugly. What can I do?
609  */
610
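/* tcp_v4_send_ack() fabricates a bare ACK on the stack, optionally
 * carrying TCP timestamp and MD5 signature options, and hands it to
 * ip_send_reply().  It is used to answer segments on behalf of
 * TIME-WAIT buckets and request socks, where no full socket exists.
 */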
611 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
612                             u32 win, u32 ts, int oif,
613                             struct tcp_md5sig_key *key)
614 {
615         struct tcphdr *th = tcp_hdr(skb);
616         struct {
617                 struct tcphdr th;
618                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
619 #ifdef CONFIG_TCP_MD5SIG
620                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
621 #endif
622                         ];
623         } rep;
624         struct ip_reply_arg arg;
625
626         memset(&rep.th, 0, sizeof(struct tcphdr));
627         memset(&arg, 0, sizeof(arg));
628
629         arg.iov[0].iov_base = (unsigned char *)&rep;
630         arg.iov[0].iov_len  = sizeof(rep.th);
631         if (ts) {
632                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
633                                    (TCPOPT_TIMESTAMP << 8) |
634                                    TCPOLEN_TIMESTAMP);
635                 rep.opt[1] = htonl(tcp_time_stamp);
636                 rep.opt[2] = htonl(ts);
637                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
638         }
639
640         /* Swap the send and the receive. */
641         rep.th.dest    = th->source;
642         rep.th.source  = th->dest;
643         rep.th.doff    = arg.iov[0].iov_len / 4;
644         rep.th.seq     = htonl(seq);
645         rep.th.ack_seq = htonl(ack);
646         rep.th.ack     = 1;
647         rep.th.window  = htons(win);
648
649 #ifdef CONFIG_TCP_MD5SIG
650         if (key) {
651                 int offset = (ts) ? 3 : 0;
652
653                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
654                                           (TCPOPT_NOP << 16) |
655                                           (TCPOPT_MD5SIG << 8) |
656                                           TCPOLEN_MD5SIG);
657                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
658                 rep.th.doff = arg.iov[0].iov_len/4;
659
660                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
661                                         key,
662                                         ip_hdr(skb)->daddr,
663                                         ip_hdr(skb)->saddr,
664                                         &rep.th, arg.iov[0].iov_len);
665         }
666 #endif
667         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
668                                       ip_hdr(skb)->saddr, /* XXX */
669                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
670         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
671         if (oif)
672                 arg.bound_dev_if = oif;
673
674         ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
675                       &arg, arg.iov[0].iov_len);
676
677         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
678 }
679
680 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
681 {
682         struct inet_timewait_sock *tw = inet_twsk(sk);
683         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
684
685         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
686                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
687                         tcptw->tw_ts_recent,
688                         tw->tw_bound_dev_if,
689                         tcp_twsk_md5_key(tcptw)
690                         );
691
692         inet_twsk_put(tw);
693 }
694
695 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
696                                   struct request_sock *req)
697 {
698         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
699                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
700                         req->ts_recent,
701                         0,
702                         tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
703 }
704
705 /*
706  *      Send a SYN-ACK after having received a SYN.
707  *      This still operates on a request_sock only, not on a big
708  *      socket.
709  */
710 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
711                                 struct dst_entry *dst)
712 {
713         const struct inet_request_sock *ireq = inet_rsk(req);
714         int err = -1;
715         struct sk_buff * skb;
716
717         /* First, grab a route. */
718         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
719                 return -1;
720
721         skb = tcp_make_synack(sk, dst, req);
722
723         if (skb) {
724                 struct tcphdr *th = tcp_hdr(skb);
725
726                 th->check = tcp_v4_check(skb->len,
727                                          ireq->loc_addr,
728                                          ireq->rmt_addr,
729                                          csum_partial((char *)th, skb->len,
730                                                       skb->csum));
731
732                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
733                                             ireq->rmt_addr,
734                                             ireq->opt);
735                 err = net_xmit_eval(err);
736         }
737
738         dst_release(dst);
739         return err;
740 }
741
742 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
743 {
744         return __tcp_v4_send_synack(sk, req, NULL);
745 }
746
747 /*
748  *      IPv4 request_sock destructor.
749  */
750 static void tcp_v4_reqsk_destructor(struct request_sock *req)
751 {
752         kfree(inet_rsk(req)->opt);
753 }
754
755 #ifdef CONFIG_SYN_COOKIES
756 static void syn_flood_warning(struct sk_buff *skb)
757 {
758         static unsigned long warntime;
759
760         if (time_after(jiffies, (warntime + HZ * 60))) {
761                 warntime = jiffies;
762                 printk(KERN_INFO
763                        "possible SYN flooding on port %d. Sending cookies.\n",
764                        ntohs(tcp_hdr(skb)->dest));
765         }
766 }
767 #endif
768
769 /*
770  * Save and compile IPv4 options into the request_sock if needed.
771  */
772 static struct ip_options *tcp_v4_save_options(struct sock *sk,
773                                               struct sk_buff *skb)
774 {
775         struct ip_options *opt = &(IPCB(skb)->opt);
776         struct ip_options *dopt = NULL;
777
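        /* ip_options_echo() builds the reply options, reversing any
         * source route so the SYN-ACK travels back along the path the
         * SYN arrived on.
         */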
778         if (opt && opt->optlen) {
779                 int opt_size = optlength(opt);
780                 dopt = kmalloc(opt_size, GFP_ATOMIC);
781                 if (dopt) {
782                         if (ip_options_echo(dopt, skb)) {
783                                 kfree(dopt);
784                                 dopt = NULL;
785                         }
786                 }
787         }
788         return dopt;
789 }
790
791 #ifdef CONFIG_TCP_MD5SIG
792 /*
793  * RFC2385 MD5 checksumming requires a mapping of
794  * IP address->MD5 Key.
795  * We need to maintain these in the sk structure.
796  */
797
798 /* Find the Key structure for an address.  */
799 static struct tcp_md5sig_key *
800                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
801 {
802         struct tcp_sock *tp = tcp_sk(sk);
803         int i;
804
805         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
806                 return NULL;
807         for (i = 0; i < tp->md5sig_info->entries4; i++) {
808                 if (tp->md5sig_info->keys4[i].addr == addr)
809                         return &tp->md5sig_info->keys4[i].base;
810         }
811         return NULL;
812 }
813
814 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
815                                          struct sock *addr_sk)
816 {
817         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
818 }
819
820 EXPORT_SYMBOL(tcp_v4_md5_lookup);
821
822 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
823                                                       struct request_sock *req)
824 {
825         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
826 }
827
828 /* This can be called on a newly created socket, from other files */
829 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
830                       u8 *newkey, u8 newkeylen)
831 {
832         /* Add Key to the list */
833         struct tcp_md5sig_key *key;
834         struct tcp_sock *tp = tcp_sk(sk);
835         struct tcp4_md5sig_key *keys;
836
837         key = tcp_v4_md5_do_lookup(sk, addr);
838         if (key) {
839                 /* Pre-existing entry - just update that one. */
840                 kfree(key->key);
841                 key->key = newkey;
842                 key->keylen = newkeylen;
843         } else {
844                 struct tcp_md5sig_info *md5sig;
845
846                 if (!tp->md5sig_info) {
847                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
848                                                   GFP_ATOMIC);
849                         if (!tp->md5sig_info) {
850                                 kfree(newkey);
851                                 return -ENOMEM;
852                         }
853                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
854                 }
855                 if (tcp_alloc_md5sig_pool() == NULL) {
856                         kfree(newkey);
857                         return -ENOMEM;
858                 }
859                 md5sig = tp->md5sig_info;
860
861                 if (md5sig->alloced4 == md5sig->entries4) {
862                         keys = kmalloc((sizeof(*keys) *
863                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
864                         if (!keys) {
865                                 kfree(newkey);
866                                 tcp_free_md5sig_pool();
867                                 return -ENOMEM;
868                         }
869
870                         if (md5sig->entries4)
871                                 memcpy(keys, md5sig->keys4,
872                                        sizeof(*keys) * md5sig->entries4);
873
874                         /* Free old key list, and reference new one */
875                         kfree(md5sig->keys4);
876                         md5sig->keys4 = keys;
877                         md5sig->alloced4++;
878                 }
879                 md5sig->entries4++;
880                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
881                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
882                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
883         }
884         return 0;
885 }
886
887 EXPORT_SYMBOL(tcp_v4_md5_do_add);
888
889 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
890                                u8 *newkey, u8 newkeylen)
891 {
892         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
893                                  newkey, newkeylen);
894 }
895
896 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
897 {
898         struct tcp_sock *tp = tcp_sk(sk);
899         int i;
900
901         for (i = 0; i < tp->md5sig_info->entries4; i++) {
902                 if (tp->md5sig_info->keys4[i].addr == addr) {
903                         /* Free the key */
904                         kfree(tp->md5sig_info->keys4[i].base.key);
905                         tp->md5sig_info->entries4--;
906
907                         if (tp->md5sig_info->entries4 == 0) {
908                                 kfree(tp->md5sig_info->keys4);
909                                 tp->md5sig_info->keys4 = NULL;
910                                 tp->md5sig_info->alloced4 = 0;
911                         } else if (tp->md5sig_info->entries4 != i) {
912                                 /* Shift the remaining entries down */
913                                 memmove(&tp->md5sig_info->keys4[i],
914                                         &tp->md5sig_info->keys4[i+1],
915                                         (tp->md5sig_info->entries4 - i) *
916                                          sizeof(struct tcp4_md5sig_key));
917                         }
918                         tcp_free_md5sig_pool();
919                         return 0;
920                 }
921         }
922         return -ENOENT;
923 }
924
925 EXPORT_SYMBOL(tcp_v4_md5_do_del);
926
927 static void tcp_v4_clear_md5_list(struct sock *sk)
928 {
929         struct tcp_sock *tp = tcp_sk(sk);
930
931         /* Free each key, then the set of keys,
932          * the crypto element, and then decrement our
933          * hold on the last resort crypto.
934          */
935         if (tp->md5sig_info->entries4) {
936                 int i;
937                 for (i = 0; i < tp->md5sig_info->entries4; i++)
938                         kfree(tp->md5sig_info->keys4[i].base.key);
939                 tp->md5sig_info->entries4 = 0;
940                 tcp_free_md5sig_pool();
941         }
942         if (tp->md5sig_info->keys4) {
943                 kfree(tp->md5sig_info->keys4);
944                 tp->md5sig_info->keys4 = NULL;
945                 tp->md5sig_info->alloced4  = 0;
946         }
947 }
948
949 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
950                                  int optlen)
951 {
952         struct tcp_md5sig cmd;
953         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
954         u8 *newkey;
955
956         if (optlen < sizeof(cmd))
957                 return -EINVAL;
958
959         if (copy_from_user(&cmd, optval, sizeof(cmd)))
960                 return -EFAULT;
961
962         if (sin->sin_family != AF_INET)
963                 return -EINVAL;
964
965         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
966                 if (!tcp_sk(sk)->md5sig_info)
967                         return -ENOENT;
968                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
969         }
970
971         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
972                 return -EINVAL;
973
974         if (!tcp_sk(sk)->md5sig_info) {
975                 struct tcp_sock *tp = tcp_sk(sk);
976                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
977
978                 if (!p)
979                         return -EINVAL;
980
981                 tp->md5sig_info = p;
982                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
983         }
984
985         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
986         if (!newkey)
987                 return -ENOMEM;
988         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
989                                  newkey, cmd.tcpm_keylen);
990 }
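/*
 * Illustrative userspace sketch (not part of the original file): how an
 * application might install a peer key via the TCP_MD5SIG socket option
 * parsed above, assuming struct tcp_md5sig and TCP_MD5SIG are available
 * from <linux/tcp.h>.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <linux/tcp.h>
 *
 *	static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
 *				   const void *key, int keylen)
 *	{
 *		struct tcp_md5sig md5;
 *
 *		memset(&md5, 0, sizeof(md5));
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = keylen;
 *		memcpy(md5.tcpm_key, key, keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}
 */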
991
992 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
993                                    __be32 saddr, __be32 daddr,
994                                    struct tcphdr *th,
995                                    unsigned int tcplen)
996 {
997         struct tcp_md5sig_pool *hp;
998         struct tcp4_pseudohdr *bp;
999         int err;
1000
1001         /*
1002          * Okay, so RFC2385 is turned on for this connection,
1003          * so we need to generate the MD5 hash for the packet now.
1004          */
1005
1006         hp = tcp_get_md5sig_pool();
1007         if (!hp)
1008                 goto clear_hash_noput;
1009
1010         bp = &hp->md5_blk.ip4;
1011
1012         /*
1013          * The TCP pseudo-header (in the order: source IP address,
1014          * destination IP address, zero-padded protocol number, and
1015          * segment length)
1016          */
1017         bp->saddr = saddr;
1018         bp->daddr = daddr;
1019         bp->pad = 0;
1020         bp->protocol = IPPROTO_TCP;
1021         bp->len = htons(tcplen);
1022
1023         err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
1024                                 th, tcplen, hp);
1025         if (err)
1026                 goto clear_hash;
1027
1028         /* Free up the crypto pool */
1029         tcp_put_md5sig_pool();
1030 out:
1031         return 0;
1032 clear_hash:
1033         tcp_put_md5sig_pool();
1034 clear_hash_noput:
1035         memset(md5_hash, 0, 16);
1036         goto out;
1037 }
1038
1039 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1040                          struct sock *sk,
1041                          struct dst_entry *dst,
1042                          struct request_sock *req,
1043                          struct tcphdr *th,
1044                          unsigned int tcplen)
1045 {
1046         __be32 saddr, daddr;
1047
1048         if (sk) {
1049                 saddr = inet_sk(sk)->saddr;
1050                 daddr = inet_sk(sk)->daddr;
1051         } else {
1052                 struct rtable *rt = (struct rtable *)dst;
1053                 BUG_ON(!rt);
1054                 saddr = rt->rt_src;
1055                 daddr = rt->rt_dst;
1056         }
1057         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1058                                        saddr, daddr,
1059                                        th, tcplen);
1060 }
1061
1062 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1063
1064 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1065 {
1066         /*
1067          * This gets called for each TCP segment that arrives
1068          * so we want to be efficient.
1069          * We have 3 drop cases:
1070          * o No MD5 hash and one expected.
1071          * o MD5 hash and we're not expecting one.
1072          * o MD5 hash and it's wrong.
1073          */
1074         __u8 *hash_location = NULL;
1075         struct tcp_md5sig_key *hash_expected;
1076         const struct iphdr *iph = ip_hdr(skb);
1077         struct tcphdr *th = tcp_hdr(skb);
1078         int genhash;
1079         unsigned char newhash[16];
1080
1081         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1082         hash_location = tcp_parse_md5sig_option(th);
1083
1084         /* We've parsed the options - do we have a hash? */
1085         if (!hash_expected && !hash_location)
1086                 return 0;
1087
1088         if (hash_expected && !hash_location) {
1089                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1090                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1091                                NIPQUAD(iph->saddr), ntohs(th->source),
1092                                NIPQUAD(iph->daddr), ntohs(th->dest));
1093                 return 1;
1094         }
1095
1096         if (!hash_expected && hash_location) {
1097                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1098                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1099                                NIPQUAD(iph->saddr), ntohs(th->source),
1100                                NIPQUAD(iph->daddr), ntohs(th->dest));
1101                 return 1;
1102         }
1103
1104         /* Okay, so both hash_expected and hash_location are set -
1105          * so we need to calculate the MD5 hash.
1106          */
1107         genhash = tcp_v4_do_calc_md5_hash(newhash,
1108                                           hash_expected,
1109                                           iph->saddr, iph->daddr,
1110                                           th, skb->len);
1111
1112         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1113                 if (net_ratelimit()) {
1114                         printk(KERN_INFO "MD5 Hash failed for "
1115                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1116                                NIPQUAD(iph->saddr), ntohs(th->source),
1117                                NIPQUAD(iph->daddr), ntohs(th->dest),
1118                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1119                 }
1120                 return 1;
1121         }
1122         return 0;
1123 }
1124
1125 #endif
1126
1127 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1128         .family         =       PF_INET,
1129         .obj_size       =       sizeof(struct tcp_request_sock),
1130         .rtx_syn_ack    =       tcp_v4_send_synack,
1131         .send_ack       =       tcp_v4_reqsk_send_ack,
1132         .destructor     =       tcp_v4_reqsk_destructor,
1133         .send_reset     =       tcp_v4_send_reset,
1134 };
1135
1136 #ifdef CONFIG_TCP_MD5SIG
1137 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1138         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1139 };
1140 #endif
1141
1142 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1143         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1144         .twsk_unique    = tcp_twsk_unique,
1145         .twsk_destructor= tcp_twsk_destructor,
1146 };
1147
1148 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1149 {
1150         struct inet_request_sock *ireq;
1151         struct tcp_options_received tmp_opt;
1152         struct request_sock *req;
1153         __be32 saddr = ip_hdr(skb)->saddr;
1154         __be32 daddr = ip_hdr(skb)->daddr;
1155         __u32 isn = TCP_SKB_CB(skb)->when;
1156         struct dst_entry *dst = NULL;
1157 #ifdef CONFIG_SYN_COOKIES
1158         int want_cookie = 0;
1159 #else
1160 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1161 #endif
1162
1163         /* Never answer SYNs sent to broadcast or multicast addresses */
1164         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1165                 goto drop;
1166
1167         /* TW buckets are converted to open requests without
1168          * limitation; they conserve resources and the peer is
1169          * evidently a real one.
1170          */
1171         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1172 #ifdef CONFIG_SYN_COOKIES
1173                 if (sysctl_tcp_syncookies) {
1174                         want_cookie = 1;
1175                 } else
1176 #endif
1177                 goto drop;
1178         }
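        /* When cookies are in use, the connection state is encoded into
         * the initial sequence number of the SYN-ACK (see
         * cookie_v4_init_sequence() below), so nothing has to be queued
         * while the SYN queue is under pressure.
         */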
1179
1180         /* The accept backlog is full. If we have already queued enough
1181          * warm entries in the syn queue, drop the request. That is better
1182          * than clogging the syn queue with openreqs with exponentially
1183          * increasing timeout.
1184          */
1185         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1186                 goto drop;
1187
1188         req = reqsk_alloc(&tcp_request_sock_ops);
1189         if (!req)
1190                 goto drop;
1191
1192 #ifdef CONFIG_TCP_MD5SIG
1193         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1194 #endif
1195
1196         tcp_clear_options(&tmp_opt);
1197         tmp_opt.mss_clamp = 536;
1198         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1199
1200         tcp_parse_options(skb, &tmp_opt, 0);
1201
1202         if (want_cookie && !tmp_opt.saw_tstamp)
1203                 tcp_clear_options(&tmp_opt);
1204
1205         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1206                 /* Some OSes (unknown ones, but I see them on a web server which
1207                  * contains information interesting only for Windows
1208                  * users) do not send their timestamp in the SYN. It is an easy
1209                  * case: we simply do not advertise TS support.
1210                  */
1211                 tmp_opt.saw_tstamp = 0;
1212                 tmp_opt.tstamp_ok  = 0;
1213         }
1214         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1215
1216         tcp_openreq_init(req, &tmp_opt, skb);
1217
1218         if (security_inet_conn_request(sk, skb, req))
1219                 goto drop_and_free;
1220
1221         ireq = inet_rsk(req);
1222         ireq->loc_addr = daddr;
1223         ireq->rmt_addr = saddr;
1224         ireq->opt = tcp_v4_save_options(sk, skb);
1225         if (!want_cookie)
1226                 TCP_ECN_create_request(req, tcp_hdr(skb));
1227
1228         if (want_cookie) {
1229 #ifdef CONFIG_SYN_COOKIES
1230                 syn_flood_warning(skb);
1231                 req->cookie_ts = tmp_opt.tstamp_ok;
1232 #endif
1233                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1234         } else if (!isn) {
1235                 struct inet_peer *peer = NULL;
1236
1237                 /* VJ's idea. We save the last timestamp seen
1238                  * from the destination in the peer table when entering
1239                  * TIME-WAIT state, and check against it before
1240                  * accepting a new connection request.
1241                  *
1242                  * If "isn" is not zero, this request hit an alive
1243                  * timewait bucket, so all the necessary checks
1244                  * are made in the function processing timewait state.
1245                  */
1246                 if (tmp_opt.saw_tstamp &&
1247                     tcp_death_row.sysctl_tw_recycle &&
1248                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1249                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1250                     peer->v4daddr == saddr) {
1251                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1252                             (s32)(peer->tcp_ts - req->ts_recent) >
1253                                                         TCP_PAWS_WINDOW) {
1254                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1255                                 goto drop_and_release;
1256                         }
1257                 }
1258                 /* Kill the following clause, if you dislike this way. */
1259                 else if (!sysctl_tcp_syncookies &&
1260                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1261                           (sysctl_max_syn_backlog >> 2)) &&
1262                          (!peer || !peer->tcp_ts_stamp) &&
1263                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1264                         /* Without syncookies the last quarter of the
1265                          * backlog is reserved for destinations proven
1266                          * to be alive.
1267                          * It means that we continue to communicate only
1268                          * with destinations already remembered at the
1269                          * moment of the synflood.
1270                          */
1271                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1272                                        "request from " NIPQUAD_FMT "/%u\n",
1273                                        NIPQUAD(saddr),
1274                                        ntohs(tcp_hdr(skb)->source));
1275                         goto drop_and_release;
1276                 }
1277
1278                 isn = tcp_v4_init_sequence(skb);
1279         }
1280         tcp_rsk(req)->snt_isn = isn;
1281
1282         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1283                 goto drop_and_free;
1284
1285         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1286         return 0;
1287
1288 drop_and_release:
1289         dst_release(dst);
1290 drop_and_free:
1291         reqsk_free(req);
1292 drop:
1293         return 0;
1294 }
1295
1296
1297 /*
1298  * The three way handshake has completed - we got a valid synack -
1299  * now create the new socket.
1300  */
1301 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1302                                   struct request_sock *req,
1303                                   struct dst_entry *dst)
1304 {
1305         struct inet_request_sock *ireq;
1306         struct inet_sock *newinet;
1307         struct tcp_sock *newtp;
1308         struct sock *newsk;
1309 #ifdef CONFIG_TCP_MD5SIG
1310         struct tcp_md5sig_key *key;
1311 #endif
1312
1313         if (sk_acceptq_is_full(sk))
1314                 goto exit_overflow;
1315
1316         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1317                 goto exit;
1318
1319         newsk = tcp_create_openreq_child(sk, req, skb);
1320         if (!newsk)
1321                 goto exit;
1322
1323         newsk->sk_gso_type = SKB_GSO_TCPV4;
1324         sk_setup_caps(newsk, dst);
1325
1326         newtp                 = tcp_sk(newsk);
1327         newinet               = inet_sk(newsk);
1328         ireq                  = inet_rsk(req);
1329         newinet->daddr        = ireq->rmt_addr;
1330         newinet->rcv_saddr    = ireq->loc_addr;
1331         newinet->saddr        = ireq->loc_addr;
1332         newinet->opt          = ireq->opt;
1333         ireq->opt             = NULL;
1334         newinet->mc_index     = inet_iif(skb);
1335         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1336         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1337         if (newinet->opt)
1338                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1339         newinet->id = newtp->write_seq ^ jiffies;
1340
1341         tcp_mtup_init(newsk);
1342         tcp_sync_mss(newsk, dst_mtu(dst));
1343         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1344         tcp_initialize_rcv_mss(newsk);
1345
1346 #ifdef CONFIG_TCP_MD5SIG
1347         /* Copy over the MD5 key from the original socket */
1348         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1349                 /*
1350                  * We're using one, so create a matching key
1351                  * on the newsk structure. If we fail to get
1352                  * memory, then we end up not copying the key
1353                  * across. Shucks.
1354                  */
1355                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1356                 if (newkey != NULL)
1357                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1358                                           newkey, key->keylen);
1359         }
1360 #endif
1361
1362         __inet_hash_nolisten(newsk);
1363         __inet_inherit_port(sk, newsk);
1364
1365         return newsk;
1366
1367 exit_overflow:
1368         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1369 exit:
1370         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1371         dst_release(dst);
1372         return NULL;
1373 }
1374
1375 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1376 {
1377         struct tcphdr *th = tcp_hdr(skb);
1378         const struct iphdr *iph = ip_hdr(skb);
1379         struct sock *nsk;
1380         struct request_sock **prev;
1381         /* Find possible connection requests. */
1382         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1383                                                        iph->saddr, iph->daddr);
1384         if (req)
1385                 return tcp_check_req(sk, skb, req, prev);
1386
1387         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1388                         th->source, iph->daddr, th->dest, inet_iif(skb));
1389
1390         if (nsk) {
1391                 if (nsk->sk_state != TCP_TIME_WAIT) {
1392                         bh_lock_sock(nsk);
1393                         return nsk;
1394                 }
1395                 inet_twsk_put(inet_twsk(nsk));
1396                 return NULL;
1397         }
1398
1399 #ifdef CONFIG_SYN_COOKIES
1400         if (!th->rst && !th->syn && th->ack)
1401                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1402 #endif
1403         return sk;
1404 }
1405
1406 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1407 {
1408         const struct iphdr *iph = ip_hdr(skb);
1409
1410         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1411                 if (!tcp_v4_check(skb->len, iph->saddr,
1412                                   iph->daddr, skb->csum)) {
1413                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1414                         return 0;
1415                 }
1416         }
1417
1418         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1419                                        skb->len, IPPROTO_TCP, 0);
1420
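        /* Short packets are cheap to verify right away; for longer ones
         * the checksum is left pending and completed later (e.g. while
         * copying the data to user space).
         */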
1421         if (skb->len <= 76) {
1422                 return __skb_checksum_complete(skb);
1423         }
1424         return 0;
1425 }
1426
1427
1428 /* The socket must have its spinlock held when we get
1429  * here.
1430  *
1431  * We have a potential double-lock case here, so even when
1432  * doing backlog processing we use the BH locking scheme.
1433  * This is because we cannot sleep with the original spinlock
1434  * held.
1435  */
1436 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1437 {
1438         struct sock *rsk;
1439 #ifdef CONFIG_TCP_MD5SIG
1440         /*
1441          * We really want to reject the packet as early as possible
1442          * if:
1443          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1444          *  o There is an MD5 option and we're not expecting one
1445          */
1446         if (tcp_v4_inbound_md5_hash(sk, skb))
1447                 goto discard;
1448 #endif
1449
1450         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1451                 TCP_CHECK_TIMER(sk);
1452                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1453                         rsk = sk;
1454                         goto reset;
1455                 }
1456                 TCP_CHECK_TIMER(sk);
1457                 return 0;
1458         }
1459
1460         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1461                 goto csum_err;
1462
1463         if (sk->sk_state == TCP_LISTEN) {
1464                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1465                 if (!nsk)
1466                         goto discard;
1467
1468                 if (nsk != sk) {
1469                         if (tcp_child_process(sk, nsk, skb)) {
1470                                 rsk = nsk;
1471                                 goto reset;
1472                         }
1473                         return 0;
1474                 }
1475         }
1476
1477         TCP_CHECK_TIMER(sk);
1478         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1479                 rsk = sk;
1480                 goto reset;
1481         }
1482         TCP_CHECK_TIMER(sk);
1483         return 0;
1484
1485 reset:
1486         tcp_v4_send_reset(rsk, skb);
1487 discard:
1488         kfree_skb(skb);
1489         /* Be careful here. If this function gets more complicated and
1490          * gcc suffers from register pressure on the x86, sk (in %ebx)
1491          * might be destroyed here. This current version compiles correctly,
1492          * but you have been warned.
1493          */
1494         return 0;
1495
1496 csum_err:
1497         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1498         goto discard;
1499 }
1500
1501 /*
1502  *      From tcp_input.c
1503  */
1504
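     /* Main IPv4 receive entry point, in outline: validate the TCP header,
      * verify or defer the checksum, fill in the TCP control block, then look
      * the segment up in the established/listening hashes.  After the XFRM
      * policy and socket-filter checks the segment is either processed
      * directly (tcp_v4_do_rcv), pushed onto the prequeue, or queued on the
      * backlog when the socket is owned by user context.  TIME_WAIT sockets
      * take the do_time_wait path below.
      */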
1505 int tcp_v4_rcv(struct sk_buff *skb)
1506 {
1507         const struct iphdr *iph;
1508         struct tcphdr *th;
1509         struct sock *sk;
1510         int ret;
1511
1512         if (skb->pkt_type != PACKET_HOST)
1513                 goto discard_it;
1514
1515         /* Count it even if it's bad */
1516         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1517
1518         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1519                 goto discard_it;
1520
1521         th = tcp_hdr(skb);
1522
1523         if (th->doff < sizeof(struct tcphdr) / 4)
1524                 goto bad_packet;
1525         if (!pskb_may_pull(skb, th->doff * 4))
1526                 goto discard_it;
1527
1528         /* An explanation is required here, I think.
1529          * Packet length and doff are validated by header prediction,
1530          * provided the case of th->doff==0 is eliminated.
1531          * So, we defer the checks. */
1532         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1533                 goto bad_packet;
1534
1535         th = tcp_hdr(skb);
1536         iph = ip_hdr(skb);
1537         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1538         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1539                                     skb->len - th->doff * 4);
1540         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1541         TCP_SKB_CB(skb)->when    = 0;
1542         TCP_SKB_CB(skb)->flags   = iph->tos;
1543         TCP_SKB_CB(skb)->sacked  = 0;
1544
1545         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1546                         th->source, iph->daddr, th->dest, inet_iif(skb));
1547         if (!sk)
1548                 goto no_tcp_socket;
1549
1550 process:
1551         if (sk->sk_state == TCP_TIME_WAIT)
1552                 goto do_time_wait;
1553
1554         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1555                 goto discard_and_relse;
1556         nf_reset(skb);
1557
1558         if (sk_filter(sk, skb))
1559                 goto discard_and_relse;
1560
1561         skb->dev = NULL;
1562
1563         bh_lock_sock_nested(sk);
1564         ret = 0;
1565         if (!sock_owned_by_user(sk)) {
1566 #ifdef CONFIG_NET_DMA
1567                 struct tcp_sock *tp = tcp_sk(sk);
1568                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1569                         tp->ucopy.dma_chan = get_softnet_dma();
1570                 if (tp->ucopy.dma_chan)
1571                         ret = tcp_v4_do_rcv(sk, skb);
1572                 else
1573 #endif
1574                 {
1575                         if (!tcp_prequeue(sk, skb))
1576                                 ret = tcp_v4_do_rcv(sk, skb);
1577                 }
1578         } else
1579                 sk_add_backlog(sk, skb);
1580         bh_unlock_sock(sk);
1581
1582         sock_put(sk);
1583
1584         return ret;
1585
1586 no_tcp_socket:
1587         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1588                 goto discard_it;
1589
1590         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1591 bad_packet:
1592                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1593         } else {
1594                 tcp_v4_send_reset(NULL, skb);
1595         }
1596
1597 discard_it:
1598         /* Discard frame. */
1599         kfree_skb(skb);
1600         return 0;
1601
1602 discard_and_relse:
1603         sock_put(sk);
1604         goto discard_it;
1605
1606 do_time_wait:
1607         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1608                 inet_twsk_put(inet_twsk(sk));
1609                 goto discard_it;
1610         }
1611
1612         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1613                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1614                 inet_twsk_put(inet_twsk(sk));
1615                 goto discard_it;
1616         }
1617         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1618         case TCP_TW_SYN: {
1619                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1620                                                         &tcp_hashinfo,
1621                                                         iph->daddr, th->dest,
1622                                                         inet_iif(skb));
1623                 if (sk2) {
1624                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1625                         inet_twsk_put(inet_twsk(sk));
1626                         sk = sk2;
1627                         goto process;
1628                 }
1629                 /* Fall through to ACK */
1630         }
1631         case TCP_TW_ACK:
1632                 tcp_v4_timewait_ack(sk, skb);
1633                 break;
1634         case TCP_TW_RST:
1635                 goto no_tcp_socket;
1636         case TCP_TW_SUCCESS:;
1637         }
1638         goto discard_it;
1639 }
1640
1641 /* VJ's idea. Save the last timestamp seen from this destination
1642  * and hold it at least for the normal timewait interval, to use for
1643  * duplicate segment detection in subsequent connections before they
1644  * enter the synchronized state.
1645  */
1646
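     /* The cached values live in the inet_peer entry for the destination
      * (taken from the cached route when possible).  They are only refreshed
      * when our ts_recent is at least as new as the cached one, or when the
      * cached entry is more than TCP_PAWS_MSL seconds old and not newer than
      * our own ts_recent_stamp, so a stale peer entry cannot clobber fresher
      * data.  These values are consulted elsewhere (e.g. by the tw_recycle
      * handling of new connection requests) to reject segments that look
      * like old duplicates.
      */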
1647 int tcp_v4_remember_stamp(struct sock *sk)
1648 {
1649         struct inet_sock *inet = inet_sk(sk);
1650         struct tcp_sock *tp = tcp_sk(sk);
1651         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1652         struct inet_peer *peer = NULL;
1653         int release_it = 0;
1654
1655         if (!rt || rt->rt_dst != inet->daddr) {
1656                 peer = inet_getpeer(inet->daddr, 1);
1657                 release_it = 1;
1658         } else {
1659                 if (!rt->peer)
1660                         rt_bind_peer(rt, 1);
1661                 peer = rt->peer;
1662         }
1663
1664         if (peer) {
1665                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1666                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1667                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1668                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1669                         peer->tcp_ts = tp->rx_opt.ts_recent;
1670                 }
1671                 if (release_it)
1672                         inet_putpeer(peer);
1673                 return 1;
1674         }
1675
1676         return 0;
1677 }
1678
1679 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1680 {
1681         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1682
1683         if (peer) {
1684                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1685
1686                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1687                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1688                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1689                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1690                         peer->tcp_ts       = tcptw->tw_ts_recent;
1691                 }
1692                 inet_putpeer(peer);
1693                 return 1;
1694         }
1695
1696         return 0;
1697 }
1698
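     /* AF-dependent hooks used by the generic connection code for plain IPv4
      * TCP sockets: how to transmit and checksum segments, rebuild headers,
      * handle connection requests, translate addresses and apply IP-level
      * socket options.
      */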
1699 struct inet_connection_sock_af_ops ipv4_specific = {
1700         .queue_xmit        = ip_queue_xmit,
1701         .send_check        = tcp_v4_send_check,
1702         .rebuild_header    = inet_sk_rebuild_header,
1703         .conn_request      = tcp_v4_conn_request,
1704         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1705         .remember_stamp    = tcp_v4_remember_stamp,
1706         .net_header_len    = sizeof(struct iphdr),
1707         .setsockopt        = ip_setsockopt,
1708         .getsockopt        = ip_getsockopt,
1709         .addr2sockaddr     = inet_csk_addr2sockaddr,
1710         .sockaddr_len      = sizeof(struct sockaddr_in),
1711         .bind_conflict     = inet_csk_bind_conflict,
1712 #ifdef CONFIG_COMPAT
1713         .compat_setsockopt = compat_ip_setsockopt,
1714         .compat_getsockopt = compat_ip_getsockopt,
1715 #endif
1716 };
1717
1718 #ifdef CONFIG_TCP_MD5SIG
1719 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1720         .md5_lookup             = tcp_v4_md5_lookup,
1721         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1722         .md5_add                = tcp_v4_md5_add_func,
1723         .md5_parse              = tcp_v4_parse_md5_keys,
1724 };
1725 #endif
1726
1727 /* NOTE: A lot of things are set to zero explicitly by the call to
1728  *       sk_alloc(), so they need not be done here.
1729  */
1730 static int tcp_v4_init_sock(struct sock *sk)
1731 {
1732         struct inet_connection_sock *icsk = inet_csk(sk);
1733         struct tcp_sock *tp = tcp_sk(sk);
1734
1735         skb_queue_head_init(&tp->out_of_order_queue);
1736         tcp_init_xmit_timers(sk);
1737         tcp_prequeue_init(tp);
1738
1739         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1740         tp->mdev = TCP_TIMEOUT_INIT;
1741
1742         /* So many TCP implementations out there (incorrectly) count the
1743          * initial SYN frame in their delayed-ACK and congestion control
1744          * algorithms that we must have the following bandaid to talk
1745          * efficiently to them.  -DaveM
1746          */
1747         tp->snd_cwnd = 2;
1748
1749         /* See draft-stevens-tcpca-spec-01 for discussion of the
1750          * initialization of these values.
1751          */
1752         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1753         tp->snd_cwnd_clamp = ~0;
1754         tp->mss_cache = 536;
1755
1756         tp->reordering = sysctl_tcp_reordering;
1757         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1758
1759         sk->sk_state = TCP_CLOSE;
1760
1761         sk->sk_write_space = sk_stream_write_space;
1762         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1763
1764         icsk->icsk_af_ops = &ipv4_specific;
1765         icsk->icsk_sync_mss = tcp_sync_mss;
1766 #ifdef CONFIG_TCP_MD5SIG
1767         tp->af_specific = &tcp_sock_ipv4_specific;
1768 #endif
1769
1770         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1771         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1772
1773         atomic_inc(&tcp_sockets_allocated);
1774
1775         return 0;
1776 }
1777
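     /* Final teardown when a TCP socket is destroyed: stop the TCP timers,
      * release the congestion control state, purge the write, out-of-order,
      * DMA wait and prequeue queues, drop any MD5 keys, release the bound
      * port and the cached sendmsg page, and undo a pending deferred-accept
      * reference before decrementing tcp_sockets_allocated.
      */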
1778 int tcp_v4_destroy_sock(struct sock *sk)
1779 {
1780         struct tcp_sock *tp = tcp_sk(sk);
1781
1782         tcp_clear_xmit_timers(sk);
1783
1784         tcp_cleanup_congestion_control(sk);
1785
1786         /* Clean up the write buffer. */
1787         tcp_write_queue_purge(sk);
1788
1789         /* Cleans up our, hopefully empty, out_of_order_queue. */
1790         __skb_queue_purge(&tp->out_of_order_queue);
1791
1792 #ifdef CONFIG_TCP_MD5SIG
1793         /* Clean up the MD5 key list, if any */
1794         if (tp->md5sig_info) {
1795                 tcp_v4_clear_md5_list(sk);
1796                 kfree(tp->md5sig_info);
1797                 tp->md5sig_info = NULL;
1798         }
1799 #endif
1800
1801 #ifdef CONFIG_NET_DMA
1802         /* Cleans up our sk_async_wait_queue */
1803         __skb_queue_purge(&sk->sk_async_wait_queue);
1804 #endif
1805
1806         /* Clean up the prequeue; it really should be empty already. */
1807         __skb_queue_purge(&tp->ucopy.prequeue);
1808
1809         /* Clean up a referenced TCP bind bucket. */
1810         if (inet_csk(sk)->icsk_bind_hash)
1811                 inet_put_port(sk);
1812
1813         /*
1814          * If a cached sendmsg page exists, toss it.
1815          */
1816         if (sk->sk_sndmsg_page) {
1817                 __free_page(sk->sk_sndmsg_page);
1818                 sk->sk_sndmsg_page = NULL;
1819         }
1820
1821         if (tp->defer_tcp_accept.request) {
1822                 reqsk_free(tp->defer_tcp_accept.request);
1823                 sock_put(tp->defer_tcp_accept.listen_sk);
1824                 sock_put(sk);
1825                 tp->defer_tcp_accept.listen_sk = NULL;
1826                 tp->defer_tcp_accept.request = NULL;
1827         }
1828
1829         atomic_dec(&tcp_sockets_allocated);
1830
1831         return 0;
1832 }
1833
1834 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1835
1836 #ifdef CONFIG_PROC_FS
1837 /* Proc filesystem TCP sock list dumping. */
1838
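     /* The /proc/net/tcp iterator below is a small state machine:
      * TCP_SEQ_STATE_LISTENING walks the listening hash, dropping into
      * TCP_SEQ_STATE_OPENREQ to dump each listener's SYN queue, then
      * TCP_SEQ_STATE_ESTABLISHED walks every ehash bucket, switching to
      * TCP_SEQ_STATE_TIME_WAIT for the bucket's twchain.  st->bucket,
      * st->sbucket and st->num track the current position so seq_file can
      * restart the walk at an arbitrary offset.
      */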
1839 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1840 {
1841         return hlist_empty(head) ? NULL :
1842                 hlist_entry(head->first, struct inet_timewait_sock, tw_node);
1843 }
1844
1845 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1846 {
1847         return tw->tw_node.next ?
1848                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1849 }
1850
1851 static void *listening_get_next(struct seq_file *seq, void *cur)
1852 {
1853         struct inet_connection_sock *icsk;
1854         struct hlist_node *node;
1855         struct sock *sk = cur;
1856         struct tcp_iter_state *st = seq->private;
1857         struct net *net = seq_file_net(seq);
1858
1859         if (!sk) {
1860                 st->bucket = 0;
1861                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1862                 goto get_sk;
1863         }
1864
1865         ++st->num;
1866
1867         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1868                 struct request_sock *req = cur;
1869
1870                 icsk = inet_csk(st->syn_wait_sk);
1871                 req = req->dl_next;
1872                 while (1) {
1873                         while (req) {
1874                                 if (req->rsk_ops->family == st->family &&
1875                                     net_eq(sock_net(req->sk), net)) {
1876                                         cur = req;
1877                                         goto out;
1878                                 }
1879                                 req = req->dl_next;
1880                         }
1881                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1882                                 break;
1883 get_req:
1884                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1885                 }
1886                 sk        = sk_next(st->syn_wait_sk);
1887                 st->state = TCP_SEQ_STATE_LISTENING;
1888                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1889         } else {
1890                 icsk = inet_csk(sk);
1891                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1892                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1893                         goto start_req;
1894                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1895                 sk = sk_next(sk);
1896         }
1897 get_sk:
1898         sk_for_each_from(sk, node) {
1899                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1900                         cur = sk;
1901                         goto out;
1902                 }
1903                 icsk = inet_csk(sk);
1904                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1905                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1906 start_req:
1907                         st->uid         = sock_i_uid(sk);
1908                         st->syn_wait_sk = sk;
1909                         st->state       = TCP_SEQ_STATE_OPENREQ;
1910                         st->sbucket     = 0;
1911                         goto get_req;
1912                 }
1913                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1914         }
1915         if (++st->bucket < INET_LHTABLE_SIZE) {
1916                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1917                 goto get_sk;
1918         }
1919         cur = NULL;
1920 out:
1921         return cur;
1922 }
1923
1924 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1925 {
1926         void *rc = listening_get_next(seq, NULL);
1927
1928         while (rc && *pos) {
1929                 rc = listening_get_next(seq, rc);
1930                 --*pos;
1931         }
1932         return rc;
1933 }
1934
1935 static void *established_get_first(struct seq_file *seq)
1936 {
1937         struct tcp_iter_state *st = seq->private;
1938         struct net *net = seq_file_net(seq);
1939         void *rc = NULL;
1940
1941         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1942                 struct sock *sk;
1943                 struct hlist_node *node;
1944                 struct inet_timewait_sock *tw;
1945                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1946
1947                 read_lock_bh(lock);
1948                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1949                         if (sk->sk_family != st->family ||
1950                             !net_eq(sock_net(sk), net)) {
1951                                 continue;
1952                         }
1953                         rc = sk;
1954                         goto out;
1955                 }
1956                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1957                 inet_twsk_for_each(tw, node,
1958                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1959                         if (tw->tw_family != st->family ||
1960                             !net_eq(twsk_net(tw), net)) {
1961                                 continue;
1962                         }
1963                         rc = tw;
1964                         goto out;
1965                 }
1966                 read_unlock_bh(lock);
1967                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1968         }
1969 out:
1970         return rc;
1971 }
1972
1973 static void *established_get_next(struct seq_file *seq, void *cur)
1974 {
1975         struct sock *sk = cur;
1976         struct inet_timewait_sock *tw;
1977         struct hlist_node *node;
1978         struct tcp_iter_state *st = seq->private;
1979         struct net *net = seq_file_net(seq);
1980
1981         ++st->num;
1982
1983         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1984                 tw = cur;
1985                 tw = tw_next(tw);
1986 get_tw:
1987                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1988                         tw = tw_next(tw);
1989                 }
1990                 if (tw) {
1991                         cur = tw;
1992                         goto out;
1993                 }
1994                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1995                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1996
1997                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1998                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1999                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2000                 } else {
2001                         cur = NULL;
2002                         goto out;
2003                 }
2004         } else
2005                 sk = sk_next(sk);
2006
2007         sk_for_each_from(sk, node) {
2008                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2009                         goto found;
2010         }
2011
2012         st->state = TCP_SEQ_STATE_TIME_WAIT;
2013         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2014         goto get_tw;
2015 found:
2016         cur = sk;
2017 out:
2018         return cur;
2019 }
2020
2021 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2022 {
2023         void *rc = established_get_first(seq);
2024
2025         while (rc && pos) {
2026                 rc = established_get_next(seq, rc);
2027                 --pos;
2028         }
2029         return rc;
2030 }
2031
2032 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2033 {
2034         void *rc;
2035         struct tcp_iter_state *st = seq->private;
2036
2037         inet_listen_lock(&tcp_hashinfo);
2038         st->state = TCP_SEQ_STATE_LISTENING;
2039         rc        = listening_get_idx(seq, &pos);
2040
2041         if (!rc) {
2042                 inet_listen_unlock(&tcp_hashinfo);
2043                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2044                 rc        = established_get_idx(seq, pos);
2045         }
2046
2047         return rc;
2048 }
2049
2050 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2051 {
2052         struct tcp_iter_state *st = seq->private;
2053         st->state = TCP_SEQ_STATE_LISTENING;
2054         st->num = 0;
2055         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2056 }
2057
2058 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2059 {
2060         void *rc = NULL;
2061         struct tcp_iter_state *st;
2062
2063         if (v == SEQ_START_TOKEN) {
2064                 rc = tcp_get_idx(seq, 0);
2065                 goto out;
2066         }
2067         st = seq->private;
2068
2069         switch (st->state) {
2070         case TCP_SEQ_STATE_OPENREQ:
2071         case TCP_SEQ_STATE_LISTENING:
2072                 rc = listening_get_next(seq, v);
2073                 if (!rc) {
2074                         inet_listen_unlock(&tcp_hashinfo);
2075                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2076                         rc        = established_get_first(seq);
2077                 }
2078                 break;
2079         case TCP_SEQ_STATE_ESTABLISHED:
2080         case TCP_SEQ_STATE_TIME_WAIT:
2081                 rc = established_get_next(seq, v);
2082                 break;
2083         }
2084 out:
2085         ++*pos;
2086         return rc;
2087 }
2088
2089 static void tcp_seq_stop(struct seq_file *seq, void *v)
2090 {
2091         struct tcp_iter_state *st = seq->private;
2092
2093         switch (st->state) {
2094         case TCP_SEQ_STATE_OPENREQ:
2095                 if (v) {
2096                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2097                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2098                 }
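                     /* Fall through: the listen lock must be released too. */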
2099         case TCP_SEQ_STATE_LISTENING:
2100                 if (v != SEQ_START_TOKEN)
2101                         inet_listen_unlock(&tcp_hashinfo);
2102                 break;
2103         case TCP_SEQ_STATE_TIME_WAIT:
2104         case TCP_SEQ_STATE_ESTABLISHED:
2105                 if (v)
2106                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2107                 break;
2108         }
2109 }
2110
2111 static int tcp_seq_open(struct inode *inode, struct file *file)
2112 {
2113         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2114         struct tcp_iter_state *s;
2115         int err;
2116
2117         err = seq_open_net(inode, file, &afinfo->seq_ops,
2118                           sizeof(struct tcp_iter_state));
2119         if (err < 0)
2120                 return err;
2121
2122         s = ((struct seq_file *)file->private_data)->private;
2123         s->family               = afinfo->family;
2124         return 0;
2125 }
2126
2127 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2128 {
2129         int rc = 0;
2130         struct proc_dir_entry *p;
2131
2132         afinfo->seq_fops.open           = tcp_seq_open;
2133         afinfo->seq_fops.read           = seq_read;
2134         afinfo->seq_fops.llseek         = seq_lseek;
2135         afinfo->seq_fops.release        = seq_release_net;
2136
2137         afinfo->seq_ops.start           = tcp_seq_start;
2138         afinfo->seq_ops.next            = tcp_seq_next;
2139         afinfo->seq_ops.stop            = tcp_seq_stop;
2140
2141         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2142                              &afinfo->seq_fops, afinfo);
2143         if (!p)
2144                 rc = -ENOMEM;
2145         return rc;
2146 }
2147
2148 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2149 {
2150         proc_net_remove(net, afinfo->name);
2151 }
2152
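     /* The three helpers below each format one row of /proc/net/tcp, for an
      * open request, a full socket and a TIME_WAIT socket respectively.  The
      * columns follow the header printed in tcp4_seq_show(): slot, local and
      * remote address:port in hex, state, tx/rx queue sizes, timer info,
      * retransmits, uid, timeout and inode, followed by the reference count,
      * socket pointer and (for full sockets) some TCP state such as rto and
      * cwnd.  As a hypothetical example, a listener bound to 0.0.0.0:80
      * would show its local_address as "00000000:0050".
      */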
2153 static void get_openreq4(struct sock *sk, struct request_sock *req,
2154                          struct seq_file *f, int i, int uid, int *len)
2155 {
2156         const struct inet_request_sock *ireq = inet_rsk(req);
2157         int ttd = req->expires - jiffies;
2158
2159         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2160                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2161                 i,
2162                 ireq->loc_addr,
2163                 ntohs(inet_sk(sk)->sport),
2164                 ireq->rmt_addr,
2165                 ntohs(ireq->rmt_port),
2166                 TCP_SYN_RECV,
2167                 0, 0, /* could print option size, but that is af dependent. */
2168                 1,    /* timers active (only the expire timer) */
2169                 jiffies_to_clock_t(ttd),
2170                 req->retrans,
2171                 uid,
2172                 0,  /* non standard timer */
2173                 0, /* open_requests have no inode */
2174                 atomic_read(&sk->sk_refcnt),
2175                 req,
2176                 len);
2177 }
2178
2179 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2180 {
2181         int timer_active;
2182         unsigned long timer_expires;
2183         struct tcp_sock *tp = tcp_sk(sk);
2184         const struct inet_connection_sock *icsk = inet_csk(sk);
2185         struct inet_sock *inet = inet_sk(sk);
2186         __be32 dest = inet->daddr;
2187         __be32 src = inet->rcv_saddr;
2188         __u16 destp = ntohs(inet->dport);
2189         __u16 srcp = ntohs(inet->sport);
2190
2191         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2192                 timer_active    = 1;
2193                 timer_expires   = icsk->icsk_timeout;
2194         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2195                 timer_active    = 4;
2196                 timer_expires   = icsk->icsk_timeout;
2197         } else if (timer_pending(&sk->sk_timer)) {
2198                 timer_active    = 2;
2199                 timer_expires   = sk->sk_timer.expires;
2200         } else {
2201                 timer_active    = 0;
2202                 timer_expires   = jiffies;
2203         }
2204
2205         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2206                         "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
2207                 i, src, srcp, dest, destp, sk->sk_state,
2208                 tp->write_seq - tp->snd_una,
2209                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2210                                              (tp->rcv_nxt - tp->copied_seq),
2211                 timer_active,
2212                 jiffies_to_clock_t(timer_expires - jiffies),
2213                 icsk->icsk_retransmits,
2214                 sock_i_uid(sk),
2215                 icsk->icsk_probes_out,
2216                 sock_i_ino(sk),
2217                 atomic_read(&sk->sk_refcnt), sk,
2218                 icsk->icsk_rto,
2219                 icsk->icsk_ack.ato,
2220                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2221                 tp->snd_cwnd,
2222                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2223                 len);
2224 }
2225
2226 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2227                                struct seq_file *f, int i, int *len)
2228 {
2229         __be32 dest, src;
2230         __u16 destp, srcp;
2231         int ttd = tw->tw_ttd - jiffies;
2232
2233         if (ttd < 0)
2234                 ttd = 0;
2235
2236         dest  = tw->tw_daddr;
2237         src   = tw->tw_rcv_saddr;
2238         destp = ntohs(tw->tw_dport);
2239         srcp  = ntohs(tw->tw_sport);
2240
2241         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2242                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2243                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2244                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2245                 atomic_read(&tw->tw_refcnt), tw, len);
2246 }
2247
2248 #define TMPSZ 150
2249
2250 static int tcp4_seq_show(struct seq_file *seq, void *v)
2251 {
2252         struct tcp_iter_state *st;
2253         int len;
2254
2255         if (v == SEQ_START_TOKEN) {
2256                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2257                            "  sl  local_address rem_address   st tx_queue "
2258                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2259                            "inode");
2260                 goto out;
2261         }
2262         st = seq->private;
2263
2264         switch (st->state) {
2265         case TCP_SEQ_STATE_LISTENING:
2266         case TCP_SEQ_STATE_ESTABLISHED:
2267                 get_tcp4_sock(v, seq, st->num, &len);
2268                 break;
2269         case TCP_SEQ_STATE_OPENREQ:
2270                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2271                 break;
2272         case TCP_SEQ_STATE_TIME_WAIT:
2273                 get_timewait4_sock(v, seq, st->num, &len);
2274                 break;
2275         }
2276         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2277 out:
2278         return 0;
2279 }
2280
2281 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2282         .name           = "tcp",
2283         .family         = AF_INET,
2284         .seq_fops       = {
2285                 .owner          = THIS_MODULE,
2286         },
2287         .seq_ops        = {
2288                 .show           = tcp4_seq_show,
2289         },
2290 };
2291
2292 static int tcp4_proc_init_net(struct net *net)
2293 {
2294         return tcp_proc_register(net, &tcp4_seq_afinfo);
2295 }
2296
2297 static void tcp4_proc_exit_net(struct net *net)
2298 {
2299         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2300 }
2301
2302 static struct pernet_operations tcp4_net_ops = {
2303         .init = tcp4_proc_init_net,
2304         .exit = tcp4_proc_exit_net,
2305 };
2306
2307 int __init tcp4_proc_init(void)
2308 {
2309         return register_pernet_subsys(&tcp4_net_ops);
2310 }
2311
2312 void tcp4_proc_exit(void)
2313 {
2314         unregister_pernet_subsys(&tcp4_net_ops);
2315 }
2316 #endif /* CONFIG_PROC_FS */
2317
2318 struct proto tcp_prot = {
2319         .name                   = "TCP",
2320         .owner                  = THIS_MODULE,
2321         .close                  = tcp_close,
2322         .connect                = tcp_v4_connect,
2323         .disconnect             = tcp_disconnect,
2324         .accept                 = inet_csk_accept,
2325         .ioctl                  = tcp_ioctl,
2326         .init                   = tcp_v4_init_sock,
2327         .destroy                = tcp_v4_destroy_sock,
2328         .shutdown               = tcp_shutdown,
2329         .setsockopt             = tcp_setsockopt,
2330         .getsockopt             = tcp_getsockopt,
2331         .recvmsg                = tcp_recvmsg,
2332         .backlog_rcv            = tcp_v4_do_rcv,
2333         .hash                   = inet_hash,
2334         .unhash                 = inet_unhash,
2335         .get_port               = inet_csk_get_port,
2336         .enter_memory_pressure  = tcp_enter_memory_pressure,
2337         .sockets_allocated      = &tcp_sockets_allocated,
2338         .orphan_count           = &tcp_orphan_count,
2339         .memory_allocated       = &tcp_memory_allocated,
2340         .memory_pressure        = &tcp_memory_pressure,
2341         .sysctl_mem             = sysctl_tcp_mem,
2342         .sysctl_wmem            = sysctl_tcp_wmem,
2343         .sysctl_rmem            = sysctl_tcp_rmem,
2344         .max_header             = MAX_TCP_HEADER,
2345         .obj_size               = sizeof(struct tcp_sock),
2346         .twsk_prot              = &tcp_timewait_sock_ops,
2347         .rsk_prot               = &tcp_request_sock_ops,
2348         .h.hashinfo             = &tcp_hashinfo,
2349 #ifdef CONFIG_COMPAT
2350         .compat_setsockopt      = compat_tcp_setsockopt,
2351         .compat_getsockopt      = compat_tcp_getsockopt,
2352 #endif
2353 };
2354
2355
2356 static int __net_init tcp_sk_init(struct net *net)
2357 {
2358         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2359                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2360 }
2361
2362 static void __net_exit tcp_sk_exit(struct net *net)
2363 {
2364         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2365 }
2366
2367 static struct pernet_operations __net_initdata tcp_sk_ops = {
2368        .init = tcp_sk_init,
2369        .exit = tcp_sk_exit,
2370 };
2371
2372 void __init tcp_v4_init(void)
2373 {
2374         if (register_pernet_device(&tcp_sk_ops))
2375                 panic("Failed to create the TCP control socket.\n");
2376 }
2377
2378 EXPORT_SYMBOL(ipv4_specific);
2379 EXPORT_SYMBOL(tcp_hashinfo);
2380 EXPORT_SYMBOL(tcp_prot);
2381 EXPORT_SYMBOL(tcp_v4_conn_request);
2382 EXPORT_SYMBOL(tcp_v4_connect);
2383 EXPORT_SYMBOL(tcp_v4_do_rcv);
2384 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2385 EXPORT_SYMBOL(tcp_v4_send_check);
2386 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2387
2388 #ifdef CONFIG_PROC_FS
2389 EXPORT_SYMBOL(tcp_proc_register);
2390 EXPORT_SYMBOL(tcp_proc_unregister);
2391 #endif
2392 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2393