tcp md5sig: Share MD5 Signature option parser between IPv4 and IPv6.
[safe/jmp/linux-2.6] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84
/*
 * Tunables with external linkage.
 *
 * sysctl_tcp_tw_reuse is consulted by tcp_twsk_unique(): when that function
 * is called with a non-NULL twp, a TIME-WAIT bucket is only taken over if
 * this flag is non-zero (and the bucket's timestamp is old enough).
 *
 * sysctl_tcp_low_latency is only defined here; it is not read anywhere in
 * the visible part of this file.
 */
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
87
/*
 * Check TCP sequence numbers in ICMP packets.
 *
 * NOTE(review): this macro is not referenced in the visible part of the
 * file; tcp_v4_err() open-codes a literal 8 in its length check, which
 * presumably is the same quantity -- confirm before relying on it.
 */
#define ICMP_MIN_LENGTH 8
90
/* Forward declaration; the definition appears later in this file. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
92
#ifdef CONFIG_TCP_MD5SIG
/*
 * Forward declarations for the MD5 signature helpers, which are called by
 * tcp_v4_send_reset() and tcp_v4_send_ack() before their definitions.
 * Both are file-local and only exist when CONFIG_TCP_MD5SIG is set.
 */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   unsigned int tcplen);
#endif
101
/*
 * The TCP socket hash tables.  Only the listening-hash lock, its user
 * count and its wait queue are given explicit initial values here; every
 * other member starts out zeroed, as for any object with static storage.
 * tcp_v4_err() passes this object to inet_lookup() to find the socket an
 * ICMP error refers to.
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
107
108 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
109 {
110         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
111                                           ip_hdr(skb)->saddr,
112                                           tcp_hdr(skb)->dest,
113                                           tcp_hdr(skb)->source);
114 }
115
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
119         struct tcp_sock *tp = tcp_sk(sk);
120
121         /* With PAWS, it is safe from the viewpoint
122            of data integrity. Even without PAWS it is safe provided sequence
123            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
124
125            Actually, the idea is close to VJ's one, only timestamp cache is
126            held not per host, but per port pair and TW bucket is used as state
127            holder.
128
129            If TW bucket has been already destroyed we fall back to VJ's scheme
130            and use initial timestamp retrieved from peer table.
131          */
132         if (tcptw->tw_ts_recent_stamp &&
133             (twp == NULL || (sysctl_tcp_tw_reuse &&
134                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
135                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
136                 if (tp->write_seq == 0)
137                         tp->write_seq = 1;
138                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
139                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
140                 sock_hold(sktw);
141                 return 1;
142         }
143
144         return 0;
145 }
146
147 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
148
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
151 {
152         struct inet_sock *inet = inet_sk(sk);
153         struct tcp_sock *tp = tcp_sk(sk);
154         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
155         struct rtable *rt;
156         __be32 daddr, nexthop;
157         int tmp;
158         int err;
159
160         if (addr_len < sizeof(struct sockaddr_in))
161                 return -EINVAL;
162
163         if (usin->sin_family != AF_INET)
164                 return -EAFNOSUPPORT;
165
166         nexthop = daddr = usin->sin_addr.s_addr;
167         if (inet->opt && inet->opt->srr) {
168                 if (!daddr)
169                         return -EINVAL;
170                 nexthop = inet->opt->faddr;
171         }
172
173         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
174                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175                                IPPROTO_TCP,
176                                inet->sport, usin->sin_port, sk, 1);
177         if (tmp < 0) {
178                 if (tmp == -ENETUNREACH)
179                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
180                 return tmp;
181         }
182
183         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184                 ip_rt_put(rt);
185                 return -ENETUNREACH;
186         }
187
188         if (!inet->opt || !inet->opt->srr)
189                 daddr = rt->rt_dst;
190
191         if (!inet->saddr)
192                 inet->saddr = rt->rt_src;
193         inet->rcv_saddr = inet->saddr;
194
195         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
196                 /* Reset inherited state */
197                 tp->rx_opt.ts_recent       = 0;
198                 tp->rx_opt.ts_recent_stamp = 0;
199                 tp->write_seq              = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
204                 struct inet_peer *peer = rt_get_peer(rt);
205                 /*
206                  * VJ's idea. We save last timestamp seen from
207                  * the destination in peer table, when entering state
208                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
209                  * when trying new connection.
210                  */
211                 if (peer != NULL &&
212                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
213                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
214                         tp->rx_opt.ts_recent = peer->tcp_ts;
215                 }
216         }
217
218         inet->dport = usin->sin_port;
219         inet->daddr = daddr;
220
221         inet_csk(sk)->icsk_ext_hdr_len = 0;
222         if (inet->opt)
223                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
224
225         tp->rx_opt.mss_clamp = 536;
226
227         /* Socket identity is still unknown (sport may be zero).
228          * However we set state to SYN-SENT and not releasing socket
229          * lock select source port, enter ourselves into the hash tables and
230          * complete initialization after this.
231          */
232         tcp_set_state(sk, TCP_SYN_SENT);
233         err = inet_hash_connect(&tcp_death_row, sk);
234         if (err)
235                 goto failure;
236
237         err = ip_route_newports(&rt, IPPROTO_TCP,
238                                 inet->sport, inet->dport, sk);
239         if (err)
240                 goto failure;
241
242         /* OK, now commit destination to socket.  */
243         sk->sk_gso_type = SKB_GSO_TCPV4;
244         sk_setup_caps(sk, &rt->u.dst);
245
246         if (!tp->write_seq)
247                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
248                                                            inet->daddr,
249                                                            inet->sport,
250                                                            usin->sin_port);
251
252         inet->id = tp->write_seq ^ jiffies;
253
254         err = tcp_connect(sk);
255         rt = NULL;
256         if (err)
257                 goto failure;
258
259         return 0;
260
261 failure:
262         /*
263          * This unhashes the socket and releases the local port,
264          * if necessary.
265          */
266         tcp_set_state(sk, TCP_CLOSE);
267         ip_rt_put(rt);
268         sk->sk_route_caps = 0;
269         inet->dport = 0;
270         return err;
271 }
272
273 /*
274  * This routine does path mtu discovery as defined in RFC1191.
275  */
276 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
277 {
278         struct dst_entry *dst;
279         struct inet_sock *inet = inet_sk(sk);
280
281         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
282          * send out by Linux are always <576bytes so they should go through
283          * unfragmented).
284          */
285         if (sk->sk_state == TCP_LISTEN)
286                 return;
287
288         /* We don't check in the destentry if pmtu discovery is forbidden
289          * on this route. We just assume that no packet_to_big packets
290          * are send back when pmtu discovery is not active.
291          * There is a small race when the user changes this flag in the
292          * route, but I think that's acceptable.
293          */
294         if ((dst = __sk_dst_check(sk, 0)) == NULL)
295                 return;
296
297         dst->ops->update_pmtu(dst, mtu);
298
299         /* Something is about to be wrong... Remember soft error
300          * for the case, if this connection will not able to recover.
301          */
302         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303                 sk->sk_err_soft = EMSGSIZE;
304
305         mtu = dst_mtu(dst);
306
307         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309                 tcp_sync_mss(sk, mtu);
310
311                 /* Resend the TCP packet because it's
312                  * clear that the old packet has been
313                  * dropped. This is the new "fast" path mtu
314                  * discovery.
315                  */
316                 tcp_simple_retransmit(sk);
317         } /* else let the usual retransmit timer handle it */
318 }
319
320 /*
321  * This routine is called by the ICMP module when it gets some
322  * sort of error condition.  If err < 0 then the socket should
323  * be closed and the error returned to the user.  If err > 0
324  * it's just the icmp type << 8 | icmp code.  After adjustment
325  * header points to the first 8 bytes of the tcp header.  We need
326  * to find the appropriate port.
327  *
328  * The locking strategy used here is very "optimistic". When
329  * someone else accesses the socket the ICMP is just dropped
330  * and for some paths there is no check at all.
331  * A more general error queue to queue errors for later handling
332  * is probably better.
333  *
334  */
335
336 void tcp_v4_err(struct sk_buff *skb, u32 info)
337 {
338         struct iphdr *iph = (struct iphdr *)skb->data;
339         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
340         struct tcp_sock *tp;
341         struct inet_sock *inet;
342         const int type = icmp_hdr(skb)->type;
343         const int code = icmp_hdr(skb)->code;
344         struct sock *sk;
345         __u32 seq;
346         int err;
347
348         if (skb->len < (iph->ihl << 2) + 8) {
349                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
350                 return;
351         }
352
353         sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
354                         iph->saddr, th->source, inet_iif(skb));
355         if (!sk) {
356                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
357                 return;
358         }
359         if (sk->sk_state == TCP_TIME_WAIT) {
360                 inet_twsk_put(inet_twsk(sk));
361                 return;
362         }
363
364         bh_lock_sock(sk);
365         /* If too many ICMPs get dropped on busy
366          * servers this needs to be solved differently.
367          */
368         if (sock_owned_by_user(sk))
369                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
370
371         if (sk->sk_state == TCP_CLOSE)
372                 goto out;
373
374         tp = tcp_sk(sk);
375         seq = ntohl(th->seq);
376         if (sk->sk_state != TCP_LISTEN &&
377             !between(seq, tp->snd_una, tp->snd_nxt)) {
378                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
379                 goto out;
380         }
381
382         switch (type) {
383         case ICMP_SOURCE_QUENCH:
384                 /* Just silently ignore these. */
385                 goto out;
386         case ICMP_PARAMETERPROB:
387                 err = EPROTO;
388                 break;
389         case ICMP_DEST_UNREACH:
390                 if (code > NR_ICMP_UNREACH)
391                         goto out;
392
393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
394                         if (!sock_owned_by_user(sk))
395                                 do_pmtu_discovery(sk, iph, info);
396                         goto out;
397                 }
398
399                 err = icmp_err_convert[code].errno;
400                 break;
401         case ICMP_TIME_EXCEEDED:
402                 err = EHOSTUNREACH;
403                 break;
404         default:
405                 goto out;
406         }
407
408         switch (sk->sk_state) {
409                 struct request_sock *req, **prev;
410         case TCP_LISTEN:
411                 if (sock_owned_by_user(sk))
412                         goto out;
413
414                 req = inet_csk_search_req(sk, &prev, th->dest,
415                                           iph->daddr, iph->saddr);
416                 if (!req)
417                         goto out;
418
419                 /* ICMPs are not backlogged, hence we cannot get
420                    an established socket here.
421                  */
422                 BUG_TRAP(!req->sk);
423
424                 if (seq != tcp_rsk(req)->snt_isn) {
425                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
426                         goto out;
427                 }
428
429                 /*
430                  * Still in SYN_RECV, just remove it silently.
431                  * There is no good way to pass the error to the newly
432                  * created socket, and POSIX does not want network
433                  * errors returned from accept().
434                  */
435                 inet_csk_reqsk_queue_drop(sk, req, prev);
436                 goto out;
437
438         case TCP_SYN_SENT:
439         case TCP_SYN_RECV:  /* Cannot happen.
440                                It can f.e. if SYNs crossed.
441                              */
442                 if (!sock_owned_by_user(sk)) {
443                         sk->sk_err = err;
444
445                         sk->sk_error_report(sk);
446
447                         tcp_done(sk);
448                 } else {
449                         sk->sk_err_soft = err;
450                 }
451                 goto out;
452         }
453
454         /* If we've already connected we will keep trying
455          * until we time out, or the user gives up.
456          *
457          * rfc1122 4.2.3.9 allows to consider as hard errors
458          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
459          * but it is obsoleted by pmtu discovery).
460          *
461          * Note, that in modern internet, where routing is unreliable
462          * and in each dark corner broken firewalls sit, sending random
463          * errors ordered by their masters even this two messages finally lose
464          * their original sense (even Linux sends invalid PORT_UNREACHs)
465          *
466          * Now we are in compliance with RFCs.
467          *                                                      --ANK (980905)
468          */
469
470         inet = inet_sk(sk);
471         if (!sock_owned_by_user(sk) && inet->recverr) {
472                 sk->sk_err = err;
473                 sk->sk_error_report(sk);
474         } else  { /* Only an error on timeout */
475                 sk->sk_err_soft = err;
476         }
477
478 out:
479         bh_unlock_sock(sk);
480         sock_put(sk);
481 }
482
483 /* This routine computes an IPv4 TCP checksum. */
484 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
485 {
486         struct inet_sock *inet = inet_sk(sk);
487         struct tcphdr *th = tcp_hdr(skb);
488
489         if (skb->ip_summed == CHECKSUM_PARTIAL) {
490                 th->check = ~tcp_v4_check(len, inet->saddr,
491                                           inet->daddr, 0);
492                 skb->csum_start = skb_transport_header(skb) - skb->head;
493                 skb->csum_offset = offsetof(struct tcphdr, check);
494         } else {
495                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
496                                          csum_partial((char *)th,
497                                                       th->doff << 2,
498                                                       skb->csum));
499         }
500 }
501
502 int tcp_v4_gso_send_check(struct sk_buff *skb)
503 {
504         const struct iphdr *iph;
505         struct tcphdr *th;
506
507         if (!pskb_may_pull(skb, sizeof(*th)))
508                 return -EINVAL;
509
510         iph = ip_hdr(skb);
511         th = tcp_hdr(skb);
512
513         th->check = 0;
514         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
515         skb->csum_start = skb_transport_header(skb) - skb->head;
516         skb->csum_offset = offsetof(struct tcphdr, check);
517         skb->ip_summed = CHECKSUM_PARTIAL;
518         return 0;
519 }
520
521 /*
522  *      This routine will send an RST to the other tcp.
523  *
524  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
525  *                    for reset.
526  *      Answer: if a packet caused RST, it is not for a socket
527  *              existing in our system, if it is matched to a socket,
528  *              it is just duplicate segment or bug in other side's TCP.
529  *              So that we build reply only basing on parameters
530  *              arrived with segment.
531  *      Exception: precedence violation. We do not implement it in any case.
532  */
533
534 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
535 {
536         struct tcphdr *th = tcp_hdr(skb);
537         struct {
538                 struct tcphdr th;
539 #ifdef CONFIG_TCP_MD5SIG
540                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
541 #endif
542         } rep;
543         struct ip_reply_arg arg;
544 #ifdef CONFIG_TCP_MD5SIG
545         struct tcp_md5sig_key *key;
546 #endif
547
548         /* Never send a reset in response to a reset. */
549         if (th->rst)
550                 return;
551
552         if (skb->rtable->rt_type != RTN_LOCAL)
553                 return;
554
555         /* Swap the send and the receive. */
556         memset(&rep, 0, sizeof(rep));
557         rep.th.dest   = th->source;
558         rep.th.source = th->dest;
559         rep.th.doff   = sizeof(struct tcphdr) / 4;
560         rep.th.rst    = 1;
561
562         if (th->ack) {
563                 rep.th.seq = th->ack_seq;
564         } else {
565                 rep.th.ack = 1;
566                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567                                        skb->len - (th->doff << 2));
568         }
569
570         memset(&arg, 0, sizeof(arg));
571         arg.iov[0].iov_base = (unsigned char *)&rep;
572         arg.iov[0].iov_len  = sizeof(rep.th);
573
574 #ifdef CONFIG_TCP_MD5SIG
575         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
576         if (key) {
577                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578                                    (TCPOPT_NOP << 16) |
579                                    (TCPOPT_MD5SIG << 8) |
580                                    TCPOLEN_MD5SIG);
581                 /* Update length and the length the header thinks exists */
582                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583                 rep.th.doff = arg.iov[0].iov_len / 4;
584
585                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
586                                         key,
587                                         ip_hdr(skb)->daddr,
588                                         ip_hdr(skb)->saddr,
589                                         &rep.th, IPPROTO_TCP,
590                                         arg.iov[0].iov_len);
591         }
592 #endif
593         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
594                                       ip_hdr(skb)->saddr, /* XXX */
595                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
596         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
597
598         ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
599                       &arg, arg.iov[0].iov_len);
600
601         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
602         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
603 }
604
605 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
606    outside socket context is ugly, certainly. What can I do?
607  */
608
609 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
610                             struct sk_buff *skb, u32 seq, u32 ack,
611                             u32 win, u32 ts)
612 {
613         struct tcphdr *th = tcp_hdr(skb);
614         struct {
615                 struct tcphdr th;
616                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
617 #ifdef CONFIG_TCP_MD5SIG
618                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
619 #endif
620                         ];
621         } rep;
622         struct ip_reply_arg arg;
623 #ifdef CONFIG_TCP_MD5SIG
624         struct tcp_md5sig_key *key;
625         struct tcp_md5sig_key tw_key;
626 #endif
627
628         memset(&rep.th, 0, sizeof(struct tcphdr));
629         memset(&arg, 0, sizeof(arg));
630
631         arg.iov[0].iov_base = (unsigned char *)&rep;
632         arg.iov[0].iov_len  = sizeof(rep.th);
633         if (ts) {
634                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
635                                    (TCPOPT_TIMESTAMP << 8) |
636                                    TCPOLEN_TIMESTAMP);
637                 rep.opt[1] = htonl(tcp_time_stamp);
638                 rep.opt[2] = htonl(ts);
639                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
640         }
641
642         /* Swap the send and the receive. */
643         rep.th.dest    = th->source;
644         rep.th.source  = th->dest;
645         rep.th.doff    = arg.iov[0].iov_len / 4;
646         rep.th.seq     = htonl(seq);
647         rep.th.ack_seq = htonl(ack);
648         rep.th.ack     = 1;
649         rep.th.window  = htons(win);
650
651 #ifdef CONFIG_TCP_MD5SIG
652         /*
653          * The SKB holds an imcoming packet, but may not have a valid ->sk
654          * pointer. This is especially the case when we're dealing with a
655          * TIME_WAIT ack, because the sk structure is long gone, and only
656          * the tcp_timewait_sock remains. So the md5 key is stashed in that
657          * structure, and we use it in preference.  I believe that (twsk ||
658          * skb->sk) holds true, but we program defensively.
659          */
660         if (!twsk && skb->sk) {
661                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
662         } else if (twsk && twsk->tw_md5_keylen) {
663                 tw_key.key = twsk->tw_md5_key;
664                 tw_key.keylen = twsk->tw_md5_keylen;
665                 key = &tw_key;
666         } else
667                 key = NULL;
668
669         if (key) {
670                 int offset = (ts) ? 3 : 0;
671
672                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
673                                           (TCPOPT_NOP << 16) |
674                                           (TCPOPT_MD5SIG << 8) |
675                                           TCPOLEN_MD5SIG);
676                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
677                 rep.th.doff = arg.iov[0].iov_len/4;
678
679                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
680                                         key,
681                                         ip_hdr(skb)->daddr,
682                                         ip_hdr(skb)->saddr,
683                                         &rep.th, IPPROTO_TCP,
684                                         arg.iov[0].iov_len);
685         }
686 #endif
687         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
688                                       ip_hdr(skb)->saddr, /* XXX */
689                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
690         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
691         if (twsk)
692                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
693
694         ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
695                       &arg, arg.iov[0].iov_len);
696
697         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
698 }
699
700 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
701 {
702         struct inet_timewait_sock *tw = inet_twsk(sk);
703         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
704
705         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
706                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
707                         tcptw->tw_ts_recent);
708
709         inet_twsk_put(tw);
710 }
711
712 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
713                                   struct request_sock *req)
714 {
715         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
716                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
717                         req->ts_recent);
718 }
719
720 /*
721  *      Send a SYN-ACK after having received a SYN.
722  *      This still operates on a request_sock only, not on a big
723  *      socket.
724  */
725 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
726                                 struct dst_entry *dst)
727 {
728         const struct inet_request_sock *ireq = inet_rsk(req);
729         int err = -1;
730         struct sk_buff * skb;
731
732         /* First, grab a route. */
733         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
734                 return -1;
735
736         skb = tcp_make_synack(sk, dst, req);
737
738         if (skb) {
739                 struct tcphdr *th = tcp_hdr(skb);
740
741                 th->check = tcp_v4_check(skb->len,
742                                          ireq->loc_addr,
743                                          ireq->rmt_addr,
744                                          csum_partial((char *)th, skb->len,
745                                                       skb->csum));
746
747                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
748                                             ireq->rmt_addr,
749                                             ireq->opt);
750                 err = net_xmit_eval(err);
751         }
752
753         dst_release(dst);
754         return err;
755 }
756
757 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
758 {
759         return __tcp_v4_send_synack(sk, req, NULL);
760 }
761
762 /*
763  *      IPv4 request_sock destructor.
764  */
765 static void tcp_v4_reqsk_destructor(struct request_sock *req)
766 {
767         kfree(inet_rsk(req)->opt);
768 }
769
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a SYN flood notice naming the destination port of @skb.  The message
 * is suppressed unless more than 60 seconds' worth of jiffies have passed
 * since the previous one.
 */
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (!time_after(jiffies, (warntime + HZ * 60)))
                return;

        warntime = jiffies;
        printk(KERN_INFO
               "possible SYN flooding on port %d. Sending cookies.\n",
               ntohs(tcp_hdr(skb)->dest));
}
#endif
783
784 /*
785  * Save and compile IPv4 options into the request_sock if needed.
786  */
787 static struct ip_options *tcp_v4_save_options(struct sock *sk,
788                                               struct sk_buff *skb)
789 {
790         struct ip_options *opt = &(IPCB(skb)->opt);
791         struct ip_options *dopt = NULL;
792
793         if (opt && opt->optlen) {
794                 int opt_size = optlength(opt);
795                 dopt = kmalloc(opt_size, GFP_ATOMIC);
796                 if (dopt) {
797                         if (ip_options_echo(dopt, skb)) {
798                                 kfree(dopt);
799                                 dopt = NULL;
800                         }
801                 }
802         }
803         return dopt;
804 }
805
806 #ifdef CONFIG_TCP_MD5SIG
807 /*
808  * RFC2385 MD5 checksumming requires a mapping of
809  * IP address->MD5 Key.
810  * We need to maintain these in the sk structure.
811  */
812
813 /* Find the Key structure for an address.  */
814 static struct tcp_md5sig_key *
815                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
816 {
817         struct tcp_sock *tp = tcp_sk(sk);
818         int i;
819
820         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
821                 return NULL;
822         for (i = 0; i < tp->md5sig_info->entries4; i++) {
823                 if (tp->md5sig_info->keys4[i].addr == addr)
824                         return &tp->md5sig_info->keys4[i].base;
825         }
826         return NULL;
827 }
828
829 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
830                                          struct sock *addr_sk)
831 {
832         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
833 }
834
835 EXPORT_SYMBOL(tcp_v4_md5_lookup);
836
837 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
838                                                       struct request_sock *req)
839 {
840         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
841 }
842
843 /* This can be called on a newly created socket, from other files */
844 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
845                       u8 *newkey, u8 newkeylen)
846 {
847         /* Add Key to the list */
848         struct tcp_md5sig_key *key;
849         struct tcp_sock *tp = tcp_sk(sk);
850         struct tcp4_md5sig_key *keys;
851
852         key = tcp_v4_md5_do_lookup(sk, addr);
853         if (key) {
854                 /* Pre-existing entry - just update that one. */
855                 kfree(key->key);
856                 key->key = newkey;
857                 key->keylen = newkeylen;
858         } else {
859                 struct tcp_md5sig_info *md5sig;
860
861                 if (!tp->md5sig_info) {
862                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
863                                                   GFP_ATOMIC);
864                         if (!tp->md5sig_info) {
865                                 kfree(newkey);
866                                 return -ENOMEM;
867                         }
868                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
869                 }
870                 if (tcp_alloc_md5sig_pool() == NULL) {
871                         kfree(newkey);
872                         return -ENOMEM;
873                 }
874                 md5sig = tp->md5sig_info;
875
876                 if (md5sig->alloced4 == md5sig->entries4) {
877                         keys = kmalloc((sizeof(*keys) *
878                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
879                         if (!keys) {
880                                 kfree(newkey);
881                                 tcp_free_md5sig_pool();
882                                 return -ENOMEM;
883                         }
884
885                         if (md5sig->entries4)
886                                 memcpy(keys, md5sig->keys4,
887                                        sizeof(*keys) * md5sig->entries4);
888
889                         /* Free old key list, and reference new one */
890                         kfree(md5sig->keys4);
891                         md5sig->keys4 = keys;
892                         md5sig->alloced4++;
893                 }
894                 md5sig->entries4++;
895                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
896                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
897                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
898         }
899         return 0;
900 }
901
902 EXPORT_SYMBOL(tcp_v4_md5_do_add);
903
904 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
905                                u8 *newkey, u8 newkeylen)
906 {
907         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
908                                  newkey, newkeylen);
909 }
910
911 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
912 {
913         struct tcp_sock *tp = tcp_sk(sk);
914         int i;
915
916         for (i = 0; i < tp->md5sig_info->entries4; i++) {
917                 if (tp->md5sig_info->keys4[i].addr == addr) {
918                         /* Free the key */
919                         kfree(tp->md5sig_info->keys4[i].base.key);
920                         tp->md5sig_info->entries4--;
921
922                         if (tp->md5sig_info->entries4 == 0) {
923                                 kfree(tp->md5sig_info->keys4);
924                                 tp->md5sig_info->keys4 = NULL;
925                                 tp->md5sig_info->alloced4 = 0;
926                         } else if (tp->md5sig_info->entries4 != i) {
927                                 /* Need to do some manipulation */
928                                 memmove(&tp->md5sig_info->keys4[i],
929                                         &tp->md5sig_info->keys4[i+1],
930                                         (tp->md5sig_info->entries4 - i) *
931                                          sizeof(struct tcp4_md5sig_key));
932                         }
933                         tcp_free_md5sig_pool();
934                         return 0;
935                 }
936         }
937         return -ENOENT;
938 }
939
940 EXPORT_SYMBOL(tcp_v4_md5_do_del);
941
942 static void tcp_v4_clear_md5_list(struct sock *sk)
943 {
944         struct tcp_sock *tp = tcp_sk(sk);
945
946         /* Free each key, then the set of key keys,
947          * the crypto element, and then decrement our
948          * hold on the last resort crypto.
949          */
950         if (tp->md5sig_info->entries4) {
951                 int i;
952                 for (i = 0; i < tp->md5sig_info->entries4; i++)
953                         kfree(tp->md5sig_info->keys4[i].base.key);
954                 tp->md5sig_info->entries4 = 0;
955                 tcp_free_md5sig_pool();
956         }
957         if (tp->md5sig_info->keys4) {
958                 kfree(tp->md5sig_info->keys4);
959                 tp->md5sig_info->keys4 = NULL;
960                 tp->md5sig_info->alloced4  = 0;
961         }
962 }
963
964 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
965                                  int optlen)
966 {
967         struct tcp_md5sig cmd;
968         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
969         u8 *newkey;
970
971         if (optlen < sizeof(cmd))
972                 return -EINVAL;
973
974         if (copy_from_user(&cmd, optval, sizeof(cmd)))
975                 return -EFAULT;
976
977         if (sin->sin_family != AF_INET)
978                 return -EINVAL;
979
980         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
981                 if (!tcp_sk(sk)->md5sig_info)
982                         return -ENOENT;
983                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
984         }
985
986         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
987                 return -EINVAL;
988
989         if (!tcp_sk(sk)->md5sig_info) {
990                 struct tcp_sock *tp = tcp_sk(sk);
991                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
992
993                 if (!p)
994                         return -EINVAL;
995
996                 tp->md5sig_info = p;
997                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
998         }
999
1000         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1001         if (!newkey)
1002                 return -ENOMEM;
1003         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1004                                  newkey, cmd.tcpm_keylen);
1005 }
1006
1007 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1008                                    __be32 saddr, __be32 daddr,
1009                                    struct tcphdr *th, int protocol,
1010                                    unsigned int tcplen)
1011 {
1012         struct scatterlist sg[4];
1013         __u16 data_len;
1014         int block = 0;
1015         __sum16 old_checksum;
1016         struct tcp_md5sig_pool *hp;
1017         struct tcp4_pseudohdr *bp;
1018         struct hash_desc *desc;
1019         int err;
1020         unsigned int nbytes = 0;
1021
1022         /*
1023          * Okay, so RFC2385 is turned on for this connection,
1024          * so we need to generate the MD5 hash for the packet now.
1025          */
1026
1027         hp = tcp_get_md5sig_pool();
1028         if (!hp)
1029                 goto clear_hash_noput;
1030
1031         bp = &hp->md5_blk.ip4;
1032         desc = &hp->md5_desc;
1033
1034         /*
1035          * 1. the TCP pseudo-header (in the order: source IP address,
1036          * destination IP address, zero-padded protocol number, and
1037          * segment length)
1038          */
1039         bp->saddr = saddr;
1040         bp->daddr = daddr;
1041         bp->pad = 0;
1042         bp->protocol = protocol;
1043         bp->len = htons(tcplen);
1044
1045         sg_init_table(sg, 4);
1046
1047         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1048         nbytes += sizeof(*bp);
1049
1050         /* 2. the TCP header, excluding options, and assuming a
1051          * checksum of zero/
1052          */
1053         old_checksum = th->check;
1054         th->check = 0;
1055         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1056         nbytes += sizeof(struct tcphdr);
1057
1058         /* 3. the TCP segment data (if any) */
1059         data_len = tcplen - (th->doff << 2);
1060         if (data_len > 0) {
1061                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1062                 sg_set_buf(&sg[block++], data, data_len);
1063                 nbytes += data_len;
1064         }
1065
1066         /* 4. an independently-specified key or password, known to both
1067          * TCPs and presumably connection-specific
1068          */
1069         sg_set_buf(&sg[block++], key->key, key->keylen);
1070         nbytes += key->keylen;
1071
1072         sg_mark_end(&sg[block - 1]);
1073
1074         /* Now store the Hash into the packet */
1075         err = crypto_hash_init(desc);
1076         if (err)
1077                 goto clear_hash;
1078         err = crypto_hash_update(desc, sg, nbytes);
1079         if (err)
1080                 goto clear_hash;
1081         err = crypto_hash_final(desc, md5_hash);
1082         if (err)
1083                 goto clear_hash;
1084
1085         /* Reset header, and free up the crypto */
1086         tcp_put_md5sig_pool();
1087         th->check = old_checksum;
1088
1089 out:
1090         return 0;
1091 clear_hash:
1092         tcp_put_md5sig_pool();
1093 clear_hash_noput:
1094         memset(md5_hash, 0, 16);
1095         goto out;
1096 }
1097
1098 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1099                          struct sock *sk,
1100                          struct dst_entry *dst,
1101                          struct request_sock *req,
1102                          struct tcphdr *th, int protocol,
1103                          unsigned int tcplen)
1104 {
1105         __be32 saddr, daddr;
1106
1107         if (sk) {
1108                 saddr = inet_sk(sk)->saddr;
1109                 daddr = inet_sk(sk)->daddr;
1110         } else {
1111                 struct rtable *rt = (struct rtable *)dst;
1112                 BUG_ON(!rt);
1113                 saddr = rt->rt_src;
1114                 daddr = rt->rt_dst;
1115         }
1116         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1117                                        saddr, daddr,
1118                                        th, protocol, tcplen);
1119 }
1120
1121 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1122
1123 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1124 {
1125         /*
1126          * This gets called for each TCP segment that arrives
1127          * so we want to be efficient.
1128          * We have 3 drop cases:
1129          * o No MD5 hash and one expected.
1130          * o MD5 hash and we're not expecting one.
1131          * o MD5 hash and its wrong.
1132          */
1133         __u8 *hash_location = NULL;
1134         struct tcp_md5sig_key *hash_expected;
1135         const struct iphdr *iph = ip_hdr(skb);
1136         struct tcphdr *th = tcp_hdr(skb);
1137         int genhash;
1138         unsigned char newhash[16];
1139
1140         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1141         hash_location = tcp_parse_md5sig_option(th);
1142
1143         /* We've parsed the options - do we have a hash? */
1144         if (!hash_expected && !hash_location)
1145                 return 0;
1146
1147         if (hash_expected && !hash_location) {
1148                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1149                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1150                                NIPQUAD(iph->saddr), ntohs(th->source),
1151                                NIPQUAD(iph->daddr), ntohs(th->dest));
1152                 return 1;
1153         }
1154
1155         if (!hash_expected && hash_location) {
1156                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1157                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1158                                NIPQUAD(iph->saddr), ntohs(th->source),
1159                                NIPQUAD(iph->daddr), ntohs(th->dest));
1160                 return 1;
1161         }
1162
1163         /* Okay, so this is hash_expected and hash_location -
1164          * so we need to calculate the checksum.
1165          */
1166         genhash = tcp_v4_do_calc_md5_hash(newhash,
1167                                           hash_expected,
1168                                           iph->saddr, iph->daddr,
1169                                           th, sk->sk_protocol,
1170                                           skb->len);
1171
1172         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1173                 if (net_ratelimit()) {
1174                         printk(KERN_INFO "MD5 Hash failed for "
1175                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1176                                NIPQUAD(iph->saddr), ntohs(th->source),
1177                                NIPQUAD(iph->daddr), ntohs(th->dest),
1178                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1179                 }
1180                 return 1;
1181         }
1182         return 0;
1183 }
1184
1185 #endif
1186
1187 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1188         .family         =       PF_INET,
1189         .obj_size       =       sizeof(struct tcp_request_sock),
1190         .rtx_syn_ack    =       tcp_v4_send_synack,
1191         .send_ack       =       tcp_v4_reqsk_send_ack,
1192         .destructor     =       tcp_v4_reqsk_destructor,
1193         .send_reset     =       tcp_v4_send_reset,
1194 };
1195
1196 #ifdef CONFIG_TCP_MD5SIG
1197 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1198         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1199 };
1200 #endif
1201
1202 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1203         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1204         .twsk_unique    = tcp_twsk_unique,
1205         .twsk_destructor= tcp_twsk_destructor,
1206 };
1207
/*
 * Process a connection request (SYN) arriving on socket sk.
 *
 * The request is dropped when the packet was routed as broadcast or
 * multicast, when the request queue is full and neither a non-zero ISN
 * nor syncookies are available, when the accept queue is full and more
 * than one young request is pending, or when allocation, the security
 * hook, the PAWS check or sending the SYN-ACK fails. Otherwise a
 * request_sock is set up from the parsed TCP options, a SYN-ACK is sent
 * and the request is added to the request hash with TCP_TIMEOUT_INIT.
 *
 * Always returns 0, for accepted and dropped requests alike.
 */
1208 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1209 {
1210         struct inet_request_sock *ireq;
1211         struct tcp_options_received tmp_opt;
1212         struct request_sock *req;
1213         __be32 saddr = ip_hdr(skb)->saddr;
1214         __be32 daddr = ip_hdr(skb)->daddr;
1215         __u32 isn = TCP_SKB_CB(skb)->when;
1216         struct dst_entry *dst = NULL;
1217 #ifdef CONFIG_SYN_COOKIES
1218         int want_cookie = 0;
1219 #else
1220 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1221 #endif
1222
1223         /* Never answer to SYNs sent to broadcast or multicast */
1224         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1225                 goto drop;
1226
1227         /* TW buckets are converted to open requests without
1228          * limitations, they conserve resources and peer is
1229          * evidently real one.
1230          */
1231         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1232 #ifdef CONFIG_SYN_COOKIES
1233                 if (sysctl_tcp_syncookies) {
1234                         want_cookie = 1;
1235                 } else
1236 #endif
1237                 goto drop;
1238         }
1239
1240         /* Accept backlog is full. If we have already queued enough
1241          * of warm entries in syn queue, drop request. It is better than
1242          * clogging syn queue with openreqs with exponentially increasing
1243          * timeout.
1244          */
1245         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1246                 goto drop;
1247
1248         req = reqsk_alloc(&tcp_request_sock_ops);
1249         if (!req)
1250                 goto drop;
1251
1252 #ifdef CONFIG_TCP_MD5SIG
1253         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1254 #endif
1255
1256         tcp_clear_options(&tmp_opt);
1257         tmp_opt.mss_clamp = 536;
1258         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1259
1260         tcp_parse_options(skb, &tmp_opt, 0);
1261
             /* In cookie mode the parsed options are kept only if a
              * timestamp was seen; otherwise they are all cleared.
              */
1262         if (want_cookie && !tmp_opt.saw_tstamp)
1263                 tcp_clear_options(&tmp_opt);
1264
1265         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1266                 /* Some OSes (unknown ones, but I see them on web server, which
1267                  * contains information interesting only for windows'
1268                  * users) do not send their stamp in SYN. It is easy case.
1269                  * We simply do not advertise TS support.
1270                  */
1271                 tmp_opt.saw_tstamp = 0;
1272                 tmp_opt.tstamp_ok  = 0;
1273         }
             /* NOTE(review): this overwrites the tstamp_ok store made in
              * the branch above, which is therefore redundant.
              */
1274         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1275
1276         tcp_openreq_init(req, &tmp_opt, skb);
1277
1278         if (security_inet_conn_request(sk, skb, req))
1279                 goto drop_and_free;
1280
1281         ireq = inet_rsk(req);
1282         ireq->loc_addr = daddr;
1283         ireq->rmt_addr = saddr;
1284         ireq->opt = tcp_v4_save_options(sk, skb);
1285         if (!want_cookie)
1286                 TCP_ECN_create_request(req, tcp_hdr(skb));
1287
1288         if (want_cookie) {
1289 #ifdef CONFIG_SYN_COOKIES
1290                 syn_flood_warning(skb);
1291                 req->cookie_ts = tmp_opt.tstamp_ok;
1292 #endif
1293                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1294         } else if (!isn) {
1295                 struct inet_peer *peer = NULL;
1296
1297                 /* VJ's idea. We save last timestamp seen
1298                  * from the destination in peer table, when entering
1299                  * state TIME-WAIT, and check against it before
1300                  * accepting new connection request.
1301                  *
1302                  * If "isn" is not zero, this request hit alive
1303                  * timewait bucket, so that all the necessary checks
1304                  * are made in the function processing timewait state.
1305                  */
1306                 if (tmp_opt.saw_tstamp &&
1307                     tcp_death_row.sysctl_tw_recycle &&
1308                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1309                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1310                     peer->v4daddr == saddr) {
1311                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1312                             (s32)(peer->tcp_ts - req->ts_recent) >
1313                                                         TCP_PAWS_WINDOW) {
1314                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1315                                 goto drop_and_release;
1316                         }
1317                 }
1318                 /* Kill the following clause, if you dislike this way. */
1319                 else if (!sysctl_tcp_syncookies &&
1320                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1321                           (sysctl_max_syn_backlog >> 2)) &&
1322                          (!peer || !peer->tcp_ts_stamp) &&
1323                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1324                         /* Without syncookies last quarter of
1325                          * backlog is filled with destinations,
1326                          * proven to be alive.
1327                          * It means that we continue to communicate
1328                          * to destinations, already remembered
1329                          * to the moment of synflood.
1330                          */
1331                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1332                                        "request from " NIPQUAD_FMT "/%u\n",
1333                                        NIPQUAD(saddr),
1334                                        ntohs(tcp_hdr(skb)->source));
1335                         goto drop_and_release;
1336                 }
1337
1338                 isn = tcp_v4_init_sequence(skb);
1339         }
1340         tcp_rsk(req)->snt_isn = isn;
1341
             /* In cookie mode the request is freed right after the SYN-ACK
              * attempt instead of being queued; a failed send is handled
              * the same way.
              */
1342         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1343                 goto drop_and_free;
1344
1345         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1346         return 0;
1347
1348 drop_and_release:
1349         dst_release(dst);
1350 drop_and_free:
1351         reqsk_free(req);
1352 drop:
1353         return 0;
1354 }
1355
1356
1357 /*
1358  * The three way handshake has completed - we got a valid synack -
1359  * now create the new socket.
1360  */
1361 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1362                                   struct request_sock *req,
1363                                   struct dst_entry *dst)
1364 {
1365         struct inet_request_sock *ireq;
1366         struct inet_sock *newinet;
1367         struct tcp_sock *newtp;
1368         struct sock *newsk;
1369 #ifdef CONFIG_TCP_MD5SIG
1370         struct tcp_md5sig_key *key;
1371 #endif
1372
1373         if (sk_acceptq_is_full(sk))
1374                 goto exit_overflow;
1375
1376         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1377                 goto exit;
1378
1379         newsk = tcp_create_openreq_child(sk, req, skb);
1380         if (!newsk)
1381                 goto exit;
1382
1383         newsk->sk_gso_type = SKB_GSO_TCPV4;
1384         sk_setup_caps(newsk, dst);
1385
1386         newtp                 = tcp_sk(newsk);
1387         newinet               = inet_sk(newsk);
1388         ireq                  = inet_rsk(req);
1389         newinet->daddr        = ireq->rmt_addr;
1390         newinet->rcv_saddr    = ireq->loc_addr;
1391         newinet->saddr        = ireq->loc_addr;
1392         newinet->opt          = ireq->opt;
1393         ireq->opt             = NULL;
1394         newinet->mc_index     = inet_iif(skb);
1395         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1396         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1397         if (newinet->opt)
1398                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1399         newinet->id = newtp->write_seq ^ jiffies;
1400
1401         tcp_mtup_init(newsk);
1402         tcp_sync_mss(newsk, dst_mtu(dst));
1403         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1404         tcp_initialize_rcv_mss(newsk);
1405
1406 #ifdef CONFIG_TCP_MD5SIG
1407         /* Copy over the MD5 key from the original socket */
1408         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1409                 /*
1410                  * We're using one, so create a matching key
1411                  * on the newsk structure. If we fail to get
1412                  * memory, then we end up not copying the key
1413                  * across. Shucks.
1414                  */
1415                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1416                 if (newkey != NULL)
1417                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1418                                           newkey, key->keylen);
1419         }
1420 #endif
1421
1422         __inet_hash_nolisten(newsk);
1423         __inet_inherit_port(sk, newsk);
1424
1425         return newsk;
1426
1427 exit_overflow:
1428         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1429 exit:
1430         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1431         dst_release(dst);
1432         return NULL;
1433 }
1434
1435 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1436 {
1437         struct tcphdr *th = tcp_hdr(skb);
1438         const struct iphdr *iph = ip_hdr(skb);
1439         struct sock *nsk;
1440         struct request_sock **prev;
1441         /* Find possible connection requests. */
1442         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1443                                                        iph->saddr, iph->daddr);
1444         if (req)
1445                 return tcp_check_req(sk, skb, req, prev);
1446
1447         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1448                         th->source, iph->daddr, th->dest, inet_iif(skb));
1449
1450         if (nsk) {
1451                 if (nsk->sk_state != TCP_TIME_WAIT) {
1452                         bh_lock_sock(nsk);
1453                         return nsk;
1454                 }
1455                 inet_twsk_put(inet_twsk(nsk));
1456                 return NULL;
1457         }
1458
1459 #ifdef CONFIG_SYN_COOKIES
1460         if (!th->rst && !th->syn && th->ack)
1461                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1462 #endif
1463         return sk;
1464 }
1465
1466 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1467 {
1468         const struct iphdr *iph = ip_hdr(skb);
1469
1470         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1471                 if (!tcp_v4_check(skb->len, iph->saddr,
1472                                   iph->daddr, skb->csum)) {
1473                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1474                         return 0;
1475                 }
1476         }
1477
1478         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1479                                        skb->len, IPPROTO_TCP, 0);
1480
1481         if (skb->len <= 76) {
1482                 return __skb_checksum_complete(skb);
1483         }
1484         return 0;
1485 }
1486
1487
1488 /* The socket must have it's spinlock held when we get
1489  * here.
1490  *
1491  * We have a potential double-lock case here, so even when
1492  * doing backlog processing we use the BH locking scheme.
1493  * This is because we cannot sleep with the original spinlock
1494  * held.
1495  */
1496 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1497 {
1498         struct sock *rsk;
1499 #ifdef CONFIG_TCP_MD5SIG
1500         /*
1501          * We really want to reject the packet as early as possible
1502          * if:
1503          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1504          *  o There is an MD5 option and we're not expecting one
1505          */
1506         if (tcp_v4_inbound_md5_hash(sk, skb))
1507                 goto discard;
1508 #endif
1509
1510         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1511                 TCP_CHECK_TIMER(sk);
1512                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1513                         rsk = sk;
1514                         goto reset;
1515                 }
1516                 TCP_CHECK_TIMER(sk);
1517                 return 0;
1518         }
1519
1520         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1521                 goto csum_err;
1522
1523         if (sk->sk_state == TCP_LISTEN) {
1524                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1525                 if (!nsk)
1526                         goto discard;
1527
1528                 if (nsk != sk) {
1529                         if (tcp_child_process(sk, nsk, skb)) {
1530                                 rsk = nsk;
1531                                 goto reset;
1532                         }
1533                         return 0;
1534                 }
1535         }
1536
1537         TCP_CHECK_TIMER(sk);
1538         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1539                 rsk = sk;
1540                 goto reset;
1541         }
1542         TCP_CHECK_TIMER(sk);
1543         return 0;
1544
1545 reset:
1546         tcp_v4_send_reset(rsk, skb);
1547 discard:
1548         kfree_skb(skb);
1549         /* Be careful here. If this function gets more complicated and
1550          * gcc suffers from register pressure on the x86, sk (in %ebx)
1551          * might be destroyed here. This current version compiles correctly,
1552          * but you have been warned.
1553          */
1554         return 0;
1555
1556 csum_err:
1557         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1558         goto discard;
1559 }
1560
1561 /*
1562  *      From tcp_input.c
1563  */
1564
1565 int tcp_v4_rcv(struct sk_buff *skb)
1566 {
1567         const struct iphdr *iph;
1568         struct tcphdr *th;
1569         struct sock *sk;
1570         int ret;
1571
1572         if (skb->pkt_type != PACKET_HOST)
1573                 goto discard_it;
1574
1575         /* Count it even if it's bad */
1576         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1577
1578         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1579                 goto discard_it;
1580
1581         th = tcp_hdr(skb);
1582
1583         if (th->doff < sizeof(struct tcphdr) / 4)
1584                 goto bad_packet;
1585         if (!pskb_may_pull(skb, th->doff * 4))
1586                 goto discard_it;
1587
1588         /* An explanation is required here, I think.
1589          * Packet length and doff are validated by header prediction,
1590          * provided case of th->doff==0 is eliminated.
1591          * So, we defer the checks. */
1592         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1593                 goto bad_packet;
1594
1595         th = tcp_hdr(skb);
1596         iph = ip_hdr(skb);
1597         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1598         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1599                                     skb->len - th->doff * 4);
1600         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1601         TCP_SKB_CB(skb)->when    = 0;
1602         TCP_SKB_CB(skb)->flags   = iph->tos;
1603         TCP_SKB_CB(skb)->sacked  = 0;
1604
1605         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1606                         th->source, iph->daddr, th->dest, inet_iif(skb));
1607         if (!sk)
1608                 goto no_tcp_socket;
1609
1610 process:
1611         if (sk->sk_state == TCP_TIME_WAIT)
1612                 goto do_time_wait;
1613
1614         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1615                 goto discard_and_relse;
1616         nf_reset(skb);
1617
1618         if (sk_filter(sk, skb))
1619                 goto discard_and_relse;
1620
1621         skb->dev = NULL;
1622
1623         bh_lock_sock_nested(sk);
1624         ret = 0;
1625         if (!sock_owned_by_user(sk)) {
1626 #ifdef CONFIG_NET_DMA
1627                 struct tcp_sock *tp = tcp_sk(sk);
1628                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1629                         tp->ucopy.dma_chan = get_softnet_dma();
1630                 if (tp->ucopy.dma_chan)
1631                         ret = tcp_v4_do_rcv(sk, skb);
1632                 else
1633 #endif
1634                 {
1635                         if (!tcp_prequeue(sk, skb))
1636                         ret = tcp_v4_do_rcv(sk, skb);
1637                 }
1638         } else
1639                 sk_add_backlog(sk, skb);
1640         bh_unlock_sock(sk);
1641
1642         sock_put(sk);
1643
1644         return ret;
1645
1646 no_tcp_socket:
1647         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1648                 goto discard_it;
1649
1650         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1651 bad_packet:
1652                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1653         } else {
1654                 tcp_v4_send_reset(NULL, skb);
1655         }
1656
1657 discard_it:
1658         /* Discard frame. */
1659         kfree_skb(skb);
1660         return 0;
1661
1662 discard_and_relse:
1663         sock_put(sk);
1664         goto discard_it;
1665
1666 do_time_wait:
1667         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1668                 inet_twsk_put(inet_twsk(sk));
1669                 goto discard_it;
1670         }
1671
1672         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1673                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1674                 inet_twsk_put(inet_twsk(sk));
1675                 goto discard_it;
1676         }
1677         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1678         case TCP_TW_SYN: {
1679                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1680                                                         &tcp_hashinfo,
1681                                                         iph->daddr, th->dest,
1682                                                         inet_iif(skb));
1683                 if (sk2) {
1684                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1685                         inet_twsk_put(inet_twsk(sk));
1686                         sk = sk2;
1687                         goto process;
1688                 }
1689                 /* Fall through to ACK */
1690         }
1691         case TCP_TW_ACK:
1692                 tcp_v4_timewait_ack(sk, skb);
1693                 break;
1694         case TCP_TW_RST:
1695                 goto no_tcp_socket;
1696         case TCP_TW_SUCCESS:;
1697         }
1698         goto discard_it;
1699 }
1700
1701 /* VJ's idea. Save last timestamp seen from this destination
1702  * and hold it at least for normal timewait interval to use for duplicate
1703  * segment detection in subsequent connections, before they enter synchronized
1704  * state.
1705  */
1706
1707 int tcp_v4_remember_stamp(struct sock *sk)
1708 {
1709         struct inet_sock *inet = inet_sk(sk);
1710         struct tcp_sock *tp = tcp_sk(sk);
1711         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1712         struct inet_peer *peer = NULL;
1713         int release_it = 0;
1714
1715         if (!rt || rt->rt_dst != inet->daddr) {
1716                 peer = inet_getpeer(inet->daddr, 1);
1717                 release_it = 1;
1718         } else {
1719                 if (!rt->peer)
1720                         rt_bind_peer(rt, 1);
1721                 peer = rt->peer;
1722         }
1723
1724         if (peer) {
1725                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1726                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1727                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1728                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1729                         peer->tcp_ts = tp->rx_opt.ts_recent;
1730                 }
1731                 if (release_it)
1732                         inet_putpeer(peer);
1733                 return 1;
1734         }
1735
1736         return 0;
1737 }
1738
1739 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1740 {
1741         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1742
1743         if (peer) {
1744                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1745
1746                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1747                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1748                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1749                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1750                         peer->tcp_ts       = tcptw->tw_ts_recent;
1751                 }
1752                 inet_putpeer(peer);
1753                 return 1;
1754         }
1755
1756         return 0;
1757 }
1758
1759 struct inet_connection_sock_af_ops ipv4_specific = {
1760         .queue_xmit        = ip_queue_xmit,
1761         .send_check        = tcp_v4_send_check,
1762         .rebuild_header    = inet_sk_rebuild_header,
1763         .conn_request      = tcp_v4_conn_request,
1764         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1765         .remember_stamp    = tcp_v4_remember_stamp,
1766         .net_header_len    = sizeof(struct iphdr),
1767         .setsockopt        = ip_setsockopt,
1768         .getsockopt        = ip_getsockopt,
1769         .addr2sockaddr     = inet_csk_addr2sockaddr,
1770         .sockaddr_len      = sizeof(struct sockaddr_in),
1771         .bind_conflict     = inet_csk_bind_conflict,
1772 #ifdef CONFIG_COMPAT
1773         .compat_setsockopt = compat_ip_setsockopt,
1774         .compat_getsockopt = compat_ip_getsockopt,
1775 #endif
1776 };
1777
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 handlers for the TCP MD5 signature option. */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse	= tcp_v4_parse_md5_keys,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_calc_md5_hash,
};
#endif
1786
1787 /* NOTE: A lot of things set to zero explicitly by call to
1788  *       sk_alloc() so need not be done here.
1789  */
1790 static int tcp_v4_init_sock(struct sock *sk)
1791 {
1792         struct inet_connection_sock *icsk = inet_csk(sk);
1793         struct tcp_sock *tp = tcp_sk(sk);
1794
1795         skb_queue_head_init(&tp->out_of_order_queue);
1796         tcp_init_xmit_timers(sk);
1797         tcp_prequeue_init(tp);
1798
1799         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1800         tp->mdev = TCP_TIMEOUT_INIT;
1801
1802         /* So many TCP implementations out there (incorrectly) count the
1803          * initial SYN frame in their delayed-ACK and congestion control
1804          * algorithms that we must have the following bandaid to talk
1805          * efficiently to them.  -DaveM
1806          */
1807         tp->snd_cwnd = 2;
1808
1809         /* See draft-stevens-tcpca-spec-01 for discussion of the
1810          * initialization of these values.
1811          */
1812         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1813         tp->snd_cwnd_clamp = ~0;
1814         tp->mss_cache = 536;
1815
1816         tp->reordering = sysctl_tcp_reordering;
1817         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1818
1819         sk->sk_state = TCP_CLOSE;
1820
1821         sk->sk_write_space = sk_stream_write_space;
1822         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1823
1824         icsk->icsk_af_ops = &ipv4_specific;
1825         icsk->icsk_sync_mss = tcp_sync_mss;
1826 #ifdef CONFIG_TCP_MD5SIG
1827         tp->af_specific = &tcp_sock_ipv4_specific;
1828 #endif
1829
1830         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1831         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1832
1833         atomic_inc(&tcp_sockets_allocated);
1834
1835         return 0;
1836 }
1837
1838 int tcp_v4_destroy_sock(struct sock *sk)
1839 {
1840         struct tcp_sock *tp = tcp_sk(sk);
1841
1842         tcp_clear_xmit_timers(sk);
1843
1844         tcp_cleanup_congestion_control(sk);
1845
1846         /* Cleanup up the write buffer. */
1847         tcp_write_queue_purge(sk);
1848
1849         /* Cleans up our, hopefully empty, out_of_order_queue. */
1850         __skb_queue_purge(&tp->out_of_order_queue);
1851
1852 #ifdef CONFIG_TCP_MD5SIG
1853         /* Clean up the MD5 key list, if any */
1854         if (tp->md5sig_info) {
1855                 tcp_v4_clear_md5_list(sk);
1856                 kfree(tp->md5sig_info);
1857                 tp->md5sig_info = NULL;
1858         }
1859 #endif
1860
1861 #ifdef CONFIG_NET_DMA
1862         /* Cleans up our sk_async_wait_queue */
1863         __skb_queue_purge(&sk->sk_async_wait_queue);
1864 #endif
1865
1866         /* Clean prequeue, it must be empty really */
1867         __skb_queue_purge(&tp->ucopy.prequeue);
1868
1869         /* Clean up a referenced TCP bind bucket. */
1870         if (inet_csk(sk)->icsk_bind_hash)
1871                 inet_put_port(sk);
1872
1873         /*
1874          * If sendmsg cached page exists, toss it.
1875          */
1876         if (sk->sk_sndmsg_page) {
1877                 __free_page(sk->sk_sndmsg_page);
1878                 sk->sk_sndmsg_page = NULL;
1879         }
1880
1881         if (tp->defer_tcp_accept.request) {
1882                 reqsk_free(tp->defer_tcp_accept.request);
1883                 sock_put(tp->defer_tcp_accept.listen_sk);
1884                 sock_put(sk);
1885                 tp->defer_tcp_accept.listen_sk = NULL;
1886                 tp->defer_tcp_accept.request = NULL;
1887         }
1888
1889         atomic_dec(&tcp_sockets_allocated);
1890
1891         return 0;
1892 }
1893
1894 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1895
1896 #ifdef CONFIG_PROC_FS
1897 /* Proc filesystem TCP sock list dumping. */
1898
1899 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1900 {
1901         return hlist_empty(head) ? NULL :
1902                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1903 }
1904
1905 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1906 {
1907         return tw->tw_node.next ?
1908                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1909 }
1910
1911 static void *listening_get_next(struct seq_file *seq, void *cur)
1912 {
1913         struct inet_connection_sock *icsk;
1914         struct hlist_node *node;
1915         struct sock *sk = cur;
1916         struct tcp_iter_state* st = seq->private;
1917         struct net *net = seq_file_net(seq);
1918
1919         if (!sk) {
1920                 st->bucket = 0;
1921                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1922                 goto get_sk;
1923         }
1924
1925         ++st->num;
1926
1927         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1928                 struct request_sock *req = cur;
1929
1930                 icsk = inet_csk(st->syn_wait_sk);
1931                 req = req->dl_next;
1932                 while (1) {
1933                         while (req) {
1934                                 if (req->rsk_ops->family == st->family &&
1935                                     net_eq(sock_net(req->sk), net)) {
1936                                         cur = req;
1937                                         goto out;
1938                                 }
1939                                 req = req->dl_next;
1940                         }
1941                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1942                                 break;
1943 get_req:
1944                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1945                 }
1946                 sk        = sk_next(st->syn_wait_sk);
1947                 st->state = TCP_SEQ_STATE_LISTENING;
1948                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1949         } else {
1950                 icsk = inet_csk(sk);
1951                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1952                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1953                         goto start_req;
1954                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1955                 sk = sk_next(sk);
1956         }
1957 get_sk:
1958         sk_for_each_from(sk, node) {
1959                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1960                         cur = sk;
1961                         goto out;
1962                 }
1963                 icsk = inet_csk(sk);
1964                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1965                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1966 start_req:
1967                         st->uid         = sock_i_uid(sk);
1968                         st->syn_wait_sk = sk;
1969                         st->state       = TCP_SEQ_STATE_OPENREQ;
1970                         st->sbucket     = 0;
1971                         goto get_req;
1972                 }
1973                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1974         }
1975         if (++st->bucket < INET_LHTABLE_SIZE) {
1976                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1977                 goto get_sk;
1978         }
1979         cur = NULL;
1980 out:
1981         return cur;
1982 }
1983
1984 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1985 {
1986         void *rc = listening_get_next(seq, NULL);
1987
1988         while (rc && *pos) {
1989                 rc = listening_get_next(seq, rc);
1990                 --*pos;
1991         }
1992         return rc;
1993 }
1994
1995 static void *established_get_first(struct seq_file *seq)
1996 {
1997         struct tcp_iter_state* st = seq->private;
1998         struct net *net = seq_file_net(seq);
1999         void *rc = NULL;
2000
2001         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2002                 struct sock *sk;
2003                 struct hlist_node *node;
2004                 struct inet_timewait_sock *tw;
2005                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2006
2007                 read_lock_bh(lock);
2008                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2009                         if (sk->sk_family != st->family ||
2010                             !net_eq(sock_net(sk), net)) {
2011                                 continue;
2012                         }
2013                         rc = sk;
2014                         goto out;
2015                 }
2016                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2017                 inet_twsk_for_each(tw, node,
2018                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2019                         if (tw->tw_family != st->family ||
2020                             !net_eq(twsk_net(tw), net)) {
2021                                 continue;
2022                         }
2023                         rc = tw;
2024                         goto out;
2025                 }
2026                 read_unlock_bh(lock);
2027                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2028         }
2029 out:
2030         return rc;
2031 }
2032
2033 static void *established_get_next(struct seq_file *seq, void *cur)
2034 {
2035         struct sock *sk = cur;
2036         struct inet_timewait_sock *tw;
2037         struct hlist_node *node;
2038         struct tcp_iter_state* st = seq->private;
2039         struct net *net = seq_file_net(seq);
2040
2041         ++st->num;
2042
2043         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2044                 tw = cur;
2045                 tw = tw_next(tw);
2046 get_tw:
2047                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2048                         tw = tw_next(tw);
2049                 }
2050                 if (tw) {
2051                         cur = tw;
2052                         goto out;
2053                 }
2054                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2055                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2056
2057                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2058                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2059                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2060                 } else {
2061                         cur = NULL;
2062                         goto out;
2063                 }
2064         } else
2065                 sk = sk_next(sk);
2066
2067         sk_for_each_from(sk, node) {
2068                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2069                         goto found;
2070         }
2071
2072         st->state = TCP_SEQ_STATE_TIME_WAIT;
2073         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2074         goto get_tw;
2075 found:
2076         cur = sk;
2077 out:
2078         return cur;
2079 }
2080
2081 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2082 {
2083         void *rc = established_get_first(seq);
2084
2085         while (rc && pos) {
2086                 rc = established_get_next(seq, rc);
2087                 --pos;
2088         }
2089         return rc;
2090 }
2091
2092 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2093 {
2094         void *rc;
2095         struct tcp_iter_state* st = seq->private;
2096
2097         inet_listen_lock(&tcp_hashinfo);
2098         st->state = TCP_SEQ_STATE_LISTENING;
2099         rc        = listening_get_idx(seq, &pos);
2100
2101         if (!rc) {
2102                 inet_listen_unlock(&tcp_hashinfo);
2103                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2104                 rc        = established_get_idx(seq, pos);
2105         }
2106
2107         return rc;
2108 }
2109
2110 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2111 {
2112         struct tcp_iter_state* st = seq->private;
2113         st->state = TCP_SEQ_STATE_LISTENING;
2114         st->num = 0;
2115         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2116 }
2117
2118 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2119 {
2120         void *rc = NULL;
2121         struct tcp_iter_state* st;
2122
2123         if (v == SEQ_START_TOKEN) {
2124                 rc = tcp_get_idx(seq, 0);
2125                 goto out;
2126         }
2127         st = seq->private;
2128
2129         switch (st->state) {
2130         case TCP_SEQ_STATE_OPENREQ:
2131         case TCP_SEQ_STATE_LISTENING:
2132                 rc = listening_get_next(seq, v);
2133                 if (!rc) {
2134                         inet_listen_unlock(&tcp_hashinfo);
2135                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2136                         rc        = established_get_first(seq);
2137                 }
2138                 break;
2139         case TCP_SEQ_STATE_ESTABLISHED:
2140         case TCP_SEQ_STATE_TIME_WAIT:
2141                 rc = established_get_next(seq, v);
2142                 break;
2143         }
2144 out:
2145         ++*pos;
2146         return rc;
2147 }
2148
2149 static void tcp_seq_stop(struct seq_file *seq, void *v)
2150 {
2151         struct tcp_iter_state* st = seq->private;
2152
2153         switch (st->state) {
2154         case TCP_SEQ_STATE_OPENREQ:
2155                 if (v) {
2156                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2157                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2158                 }
2159         case TCP_SEQ_STATE_LISTENING:
2160                 if (v != SEQ_START_TOKEN)
2161                         inet_listen_unlock(&tcp_hashinfo);
2162                 break;
2163         case TCP_SEQ_STATE_TIME_WAIT:
2164         case TCP_SEQ_STATE_ESTABLISHED:
2165                 if (v)
2166                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2167                 break;
2168         }
2169 }
2170
2171 static int tcp_seq_open(struct inode *inode, struct file *file)
2172 {
2173         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2174         struct tcp_iter_state *s;
2175         int err;
2176
2177         err = seq_open_net(inode, file, &afinfo->seq_ops,
2178                           sizeof(struct tcp_iter_state));
2179         if (err < 0)
2180                 return err;
2181
2182         s = ((struct seq_file *)file->private_data)->private;
2183         s->family               = afinfo->family;
2184         return 0;
2185 }
2186
2187 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2188 {
2189         int rc = 0;
2190         struct proc_dir_entry *p;
2191
2192         afinfo->seq_fops.open           = tcp_seq_open;
2193         afinfo->seq_fops.read           = seq_read;
2194         afinfo->seq_fops.llseek         = seq_lseek;
2195         afinfo->seq_fops.release        = seq_release_net;
2196
2197         afinfo->seq_ops.start           = tcp_seq_start;
2198         afinfo->seq_ops.next            = tcp_seq_next;
2199         afinfo->seq_ops.stop            = tcp_seq_stop;
2200
2201         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2202                              &afinfo->seq_fops, afinfo);
2203         if (!p)
2204                 rc = -ENOMEM;
2205         return rc;
2206 }
2207
2208 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2209 {
2210         proc_net_remove(net, afinfo->name);
2211 }
2212
2213 static void get_openreq4(struct sock *sk, struct request_sock *req,
2214                          struct seq_file *f, int i, int uid, int *len)
2215 {
2216         const struct inet_request_sock *ireq = inet_rsk(req);
2217         int ttd = req->expires - jiffies;
2218
2219         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2220                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2221                 i,
2222                 ireq->loc_addr,
2223                 ntohs(inet_sk(sk)->sport),
2224                 ireq->rmt_addr,
2225                 ntohs(ireq->rmt_port),
2226                 TCP_SYN_RECV,
2227                 0, 0, /* could print option size, but that is af dependent. */
2228                 1,    /* timers active (only the expire timer) */
2229                 jiffies_to_clock_t(ttd),
2230                 req->retrans,
2231                 uid,
2232                 0,  /* non standard timer */
2233                 0, /* open_requests have no inode */
2234                 atomic_read(&sk->sk_refcnt),
2235                 req,
2236                 len);
2237 }
2238
2239 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2240 {
2241         int timer_active;
2242         unsigned long timer_expires;
2243         struct tcp_sock *tp = tcp_sk(sk);
2244         const struct inet_connection_sock *icsk = inet_csk(sk);
2245         struct inet_sock *inet = inet_sk(sk);
2246         __be32 dest = inet->daddr;
2247         __be32 src = inet->rcv_saddr;
2248         __u16 destp = ntohs(inet->dport);
2249         __u16 srcp = ntohs(inet->sport);
2250
2251         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2252                 timer_active    = 1;
2253                 timer_expires   = icsk->icsk_timeout;
2254         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2255                 timer_active    = 4;
2256                 timer_expires   = icsk->icsk_timeout;
2257         } else if (timer_pending(&sk->sk_timer)) {
2258                 timer_active    = 2;
2259                 timer_expires   = sk->sk_timer.expires;
2260         } else {
2261                 timer_active    = 0;
2262                 timer_expires = jiffies;
2263         }
2264
2265         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2266                         "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
2267                 i, src, srcp, dest, destp, sk->sk_state,
2268                 tp->write_seq - tp->snd_una,
2269                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2270                                              (tp->rcv_nxt - tp->copied_seq),
2271                 timer_active,
2272                 jiffies_to_clock_t(timer_expires - jiffies),
2273                 icsk->icsk_retransmits,
2274                 sock_i_uid(sk),
2275                 icsk->icsk_probes_out,
2276                 sock_i_ino(sk),
2277                 atomic_read(&sk->sk_refcnt), sk,
2278                 icsk->icsk_rto,
2279                 icsk->icsk_ack.ato,
2280                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2281                 tp->snd_cwnd,
2282                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2283                 len);
2284 }
2285
2286 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2287                                struct seq_file *f, int i, int *len)
2288 {
2289         __be32 dest, src;
2290         __u16 destp, srcp;
2291         int ttd = tw->tw_ttd - jiffies;
2292
2293         if (ttd < 0)
2294                 ttd = 0;
2295
2296         dest  = tw->tw_daddr;
2297         src   = tw->tw_rcv_saddr;
2298         destp = ntohs(tw->tw_dport);
2299         srcp  = ntohs(tw->tw_sport);
2300
2301         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2302                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2303                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2304                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2305                 atomic_read(&tw->tw_refcnt), tw, len);
2306 }
2307
2308 #define TMPSZ 150
2309
2310 static int tcp4_seq_show(struct seq_file *seq, void *v)
2311 {
2312         struct tcp_iter_state* st;
2313         int len;
2314
2315         if (v == SEQ_START_TOKEN) {
2316                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2317                            "  sl  local_address rem_address   st tx_queue "
2318                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2319                            "inode");
2320                 goto out;
2321         }
2322         st = seq->private;
2323
2324         switch (st->state) {
2325         case TCP_SEQ_STATE_LISTENING:
2326         case TCP_SEQ_STATE_ESTABLISHED:
2327                 get_tcp4_sock(v, seq, st->num, &len);
2328                 break;
2329         case TCP_SEQ_STATE_OPENREQ:
2330                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2331                 break;
2332         case TCP_SEQ_STATE_TIME_WAIT:
2333                 get_timewait4_sock(v, seq, st->num, &len);
2334                 break;
2335         }
2336         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2337 out:
2338         return 0;
2339 }
2340
2341 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2342         .name           = "tcp",
2343         .family         = AF_INET,
2344         .seq_fops       = {
2345                 .owner          = THIS_MODULE,
2346         },
2347         .seq_ops        = {
2348                 .show           = tcp4_seq_show,
2349         },
2350 };
2351
2352 static int tcp4_proc_init_net(struct net *net)
2353 {
2354         return tcp_proc_register(net, &tcp4_seq_afinfo);
2355 }
2356
2357 static void tcp4_proc_exit_net(struct net *net)
2358 {
2359         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2360 }
2361
2362 static struct pernet_operations tcp4_net_ops = {
2363         .init = tcp4_proc_init_net,
2364         .exit = tcp4_proc_exit_net,
2365 };
2366
2367 int __init tcp4_proc_init(void)
2368 {
2369         return register_pernet_subsys(&tcp4_net_ops);
2370 }
2371
2372 void tcp4_proc_exit(void)
2373 {
2374         unregister_pernet_subsys(&tcp4_net_ops);
2375 }
2376 #endif /* CONFIG_PROC_FS */
2377
2378 struct proto tcp_prot = {
2379         .name                   = "TCP",
2380         .owner                  = THIS_MODULE,
2381         .close                  = tcp_close,
2382         .connect                = tcp_v4_connect,
2383         .disconnect             = tcp_disconnect,
2384         .accept                 = inet_csk_accept,
2385         .ioctl                  = tcp_ioctl,
2386         .init                   = tcp_v4_init_sock,
2387         .destroy                = tcp_v4_destroy_sock,
2388         .shutdown               = tcp_shutdown,
2389         .setsockopt             = tcp_setsockopt,
2390         .getsockopt             = tcp_getsockopt,
2391         .recvmsg                = tcp_recvmsg,
2392         .backlog_rcv            = tcp_v4_do_rcv,
2393         .hash                   = inet_hash,
2394         .unhash                 = inet_unhash,
2395         .get_port               = inet_csk_get_port,
2396         .enter_memory_pressure  = tcp_enter_memory_pressure,
2397         .sockets_allocated      = &tcp_sockets_allocated,
2398         .orphan_count           = &tcp_orphan_count,
2399         .memory_allocated       = &tcp_memory_allocated,
2400         .memory_pressure        = &tcp_memory_pressure,
2401         .sysctl_mem             = sysctl_tcp_mem,
2402         .sysctl_wmem            = sysctl_tcp_wmem,
2403         .sysctl_rmem            = sysctl_tcp_rmem,
2404         .max_header             = MAX_TCP_HEADER,
2405         .obj_size               = sizeof(struct tcp_sock),
2406         .twsk_prot              = &tcp_timewait_sock_ops,
2407         .rsk_prot               = &tcp_request_sock_ops,
2408         .h.hashinfo             = &tcp_hashinfo,
2409 #ifdef CONFIG_COMPAT
2410         .compat_setsockopt      = compat_tcp_setsockopt,
2411         .compat_getsockopt      = compat_tcp_getsockopt,
2412 #endif
2413 };
2414
2415
2416 static int __net_init tcp_sk_init(struct net *net)
2417 {
2418         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2419                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2420 }
2421
2422 static void __net_exit tcp_sk_exit(struct net *net)
2423 {
2424         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2425 }
2426
2427 static struct pernet_operations __net_initdata tcp_sk_ops = {
2428        .init = tcp_sk_init,
2429        .exit = tcp_sk_exit,
2430 };
2431
2432 void __init tcp_v4_init(void)
2433 {
2434         if (register_pernet_device(&tcp_sk_ops))
2435                 panic("Failed to create the TCP control socket.\n");
2436 }
2437
/* Data objects exported from this file. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);

/* IPv4 TCP entry points exported from this file. */
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
/* Proc registration helpers, only built with procfs support. */
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
2453