[NET]: Split skb->csum
net/ipv4/tcp_ipv4.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */


#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen);
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
        return inet_csk_get_port(&tcp_hashinfo, sk, snum,
                                 inet_csk_bind_conflict);
}

static void tcp_v4_hash(struct sock *sk)
{
        inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
        inet_unhash(&tcp_hashinfo, sk);
}

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
        return secure_tcp_sequence_number(skb->nh.iph->daddr,
                                          skb->nh.iph->saddr,
                                          skb->h.th->dest,
                                          skb->h.th->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has been already destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);

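/*
 * A rough illustration of the write_seq bump above: 65535 is the
 * largest unscaled receive window and the extra 2 covers the SYN and
 * FIN sequence numbers, so with, say, tw_snd_nxt == 1000000 the new
 * connection starts at 1065537, safely beyond anything the previous
 * incarnation of the connection could still have in flight, even
 * without PAWS.
 */
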
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save the last timestamp seen from
                 * the destination in the peer table, when entering state
                 * TIME-WAIT, and initialize rx_opt.ts_recent from it
                 * when trying a new connection.
                 */
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the
         * hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}

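/*
 * Typical call path (a sketch): a user-space connect() on a TCP socket
 * reaches inet_stream_connect(), which calls sk->sk_prot->connect(),
 * i.e. tcp_v4_connect() above, with the socket lock held.
 */
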
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the dst entry if PMTU discovery is forbidden
         * on this route. We just assume that no packet-too-big packets
         * are sent back when PMTU discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to go wrong... Remember the soft error
         * for the case that this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}

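/*
 * Worked example for the path above: if ICMP reports an MTU of 1400,
 * tcp_sync_mss(sk, 1400) leaves roughly 1400 - 20 (IP) - 20 (TCP) =
 * 1360 bytes of MSS (less any IP or TCP options), and the shrunk
 * segments are pushed out immediately by tcp_simple_retransmit().
 */
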
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
                         th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can, for example, if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
         * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
         * obsoleted by PMTU discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in each dark corner sending random
         * errors ordered by their masters, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with the RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcphdr *th = skb->h.th;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(th, len,
                                          inet->saddr, inet->daddr, 0);
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

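/*
 * With CHECKSUM_PARTIAL, the checksum field is seeded with the folded
 * pseudo-header sum, and skb->csum_offset (introduced by this patch,
 * aliased with skb->csum) names the slot, relative to skb->h.raw, where
 * the device must store the final checksum.  A minimal software sketch
 * of what the device (or skb_checksum_help()) then does; the helper
 * name is hypothetical and the block is not compiled:
 */
#if 0
static void example_finish_partial_csum(struct sk_buff *skb)
{
        unsigned int off = skb->h.raw - skb->data;
        /* Sum everything from the transport header onward... */
        __wsum csum = skb_checksum(skb, off, skb->len - off, 0);

        /* ...and fold the result into the slot named by csum_offset. */
        *(__sum16 *)(skb->h.raw + skb->csum_offset) = csum_fold(csum);
}
#endif
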
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = skb->nh.iph;
        th = skb->h.th;

        th->check = 0;
        th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
        skb->csum_offset = offsetof(struct tcphdr, check);
        skb->ip_summed = CHECKSUM_PARTIAL;
        return 0;
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for the reset?
 *      Answer: if a packet caused the RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL;
        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
                                        key,
                                        skb->nh.iph->daddr,
                                        skb->nh.iph->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /* XXX */
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}

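/*
 * Example of the ack_seq computation above: for an incoming SYN with
 * seq = S and no payload, the RST carries ack_seq = S + 1 (the SYN
 * occupies one sequence number); for a bare data segment of payload
 * length L it is S + L.
 */
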
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
        struct tcp_md5sig_key tw_key;
#endif

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len = TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        /*
         * The SKB holds an incoming packet, but may not have a valid ->sk
         * pointer. This is especially the case when we're dealing with a
         * TIME_WAIT ack, because the sk structure is long gone, and only
         * the tcp_timewait_sock remains. So the md5 key is stashed in that
         * structure, and we use it in preference.  I believe that (twsk ||
         * skb->sk) holds true, but we program defensively.
         */
        if (!twsk && skb->sk) {
                key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr);
        } else if (twsk && twsk->tw_md5_keylen) {
                tw_key.key = twsk->tw_md5_key;
                tw_key.keylen = twsk->tw_md5_keylen;
                key = &tw_key;
        } else
                key = NULL;

        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
                                        key,
                                        skb->nh.iph->daddr,
                                        skb->nh.iph->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent);

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                              struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                goto out;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = skb->h.th;

                th->check = tcp_v4_check(th, skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

out:
        dst_release(dst);
        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(skb->h.th->dest));
        }
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
                                              struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
                return NULL;
        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr)
                        return (struct tcp_md5sig_key *)
                                                &tp->md5sig_info->keys4[i];
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                               struct request_sock *req)
{
        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                      u8 *newkey, u8 newkeylen)
{
        /* Add Key to the list */
        struct tcp4_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp4_md5sig_key *keys;

        key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
        if (key) {
                /* Pre-existing entry - just update that one. */
                kfree(key->key);
                key->key = newkey;
                key->keylen = newkeylen;
        } else {
                struct tcp_md5sig_info *md5sig;

                if (!tp->md5sig_info) {
                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
                                                  GFP_ATOMIC);
                        if (!tp->md5sig_info) {
                                kfree(newkey);
                                return -ENOMEM;
                        }
                }
                if (tcp_alloc_md5sig_pool() == NULL) {
                        kfree(newkey);
                        return -ENOMEM;
                }
                md5sig = tp->md5sig_info;

                if (md5sig->alloced4 == md5sig->entries4) {
                        keys = kmalloc((sizeof(*keys) *
                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
                        if (!keys) {
                                kfree(newkey);
                                tcp_free_md5sig_pool();
                                return -ENOMEM;
                        }

                        if (md5sig->entries4)
                                memcpy(keys, md5sig->keys4,
                                       sizeof(*keys) * md5sig->entries4);

                        /* Free old key list, and reference new one */
                        if (md5sig->keys4)
                                kfree(md5sig->keys4);
                        md5sig->keys4 = keys;
                        md5sig->alloced4++;
                }
                md5sig->entries4++;
                md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
                md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
                md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
        }
        return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

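/*
 * Note on key ownership in tcp_v4_md5_do_add(): the function takes over
 * newkey in every case - it is either stored in the key table or
 * kfree()d on the error paths - so callers must not free it themselves.
 */
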
static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
                               u8 *newkey, u8 newkeylen)
{
        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
                                 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr) {
                        /* Free the key */
                        kfree(tp->md5sig_info->keys4[i].key);
                        tp->md5sig_info->entries4--;

                        if (tp->md5sig_info->entries4 == 0) {
                                kfree(tp->md5sig_info->keys4);
                                tp->md5sig_info->keys4 = NULL;
                        } else if (tp->md5sig_info->entries4 != i) {
                                /* Need to do some manipulation */
                                memcpy(&tp->md5sig_info->keys4[i],
                                       &tp->md5sig_info->keys4[i+1],
                                       (tp->md5sig_info->entries4 - i) *
                                        sizeof(struct tcp4_md5sig_key));
                        }
                        tcp_free_md5sig_pool();
                        return 0;
                }
        }
        return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Free each key, then the set of keys, the crypto element,
         * and then decrement our hold on the last resort crypto.
         */
        if (tp->md5sig_info->entries4) {
                int i;
                for (i = 0; i < tp->md5sig_info->entries4; i++)
                        kfree(tp->md5sig_info->keys4[i].key);
                tp->md5sig_info->entries4 = 0;
                tcp_free_md5sig_pool();
        }
        if (tp->md5sig_info->keys4) {
                kfree(tp->md5sig_info->keys4);
                tp->md5sig_info->keys4 = NULL;
                tp->md5sig_info->alloced4  = 0;
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 *newkey;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
                if (!tcp_sk(sk)->md5sig_info)
                        return -ENOENT;
                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
        }

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        if (!tcp_sk(sk)->md5sig_info) {
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

                if (!p)
                        return -EINVAL;

                tp->md5sig_info = p;

        }

        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
        if (!newkey)
                return -ENOMEM;
        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
                                 newkey, cmd.tcpm_keylen);
}

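/*
 * Illustrative user-space counterpart of the setsockopt handler above
 * (a sketch, assuming <netinet/tcp.h> exposes struct tcp_md5sig; the
 * names fd, peer and secret are hypothetical):
 *
 *      struct tcp_md5sig md5;
 *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      memset(&md5, 0, sizeof(md5));
 *      sin->sin_family = AF_INET;
 *      sin->sin_addr.s_addr = peer.sin_addr.s_addr;
 *      md5.tcpm_keylen = strlen(secret);
 *      memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *      if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
 *              perror("TCP_MD5SIG");
 *
 * Passing tcpm_keylen == 0 deletes the key for that address, matching
 * the tcp_v4_md5_do_del() branch above.
 */
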
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen)
{
        struct scatterlist sg[4];
        __u16 data_len;
        int block = 0;
#ifdef CONFIG_TCP_MD5SIG_DEBUG
        int i;
#endif
        __sum16 old_checksum;
        struct tcp_md5sig_pool *hp;
        struct tcp4_pseudohdr *bp;
        struct hash_desc *desc;
        int err;
        unsigned int nbytes = 0;

        /*
         * Okay, so RFC2385 is turned on for this connection,
         * so we need to generate the MD5 hash for the packet now.
         */

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;

        bp = &hp->md5_blk.ip4;
        desc = &hp->md5_desc;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = protocol;
        bp->len = htons(tcplen);
        sg_set_buf(&sg[block++], bp, sizeof(*bp));
        nbytes += sizeof(*bp);

#ifdef CONFIG_TCP_MD5SIG_DEBUG
        printk("Calculating hash for: ");
        for (i = 0; i < sizeof(*bp); i++)
                printk("%02x ", (unsigned int)((unsigned char *)bp)[i]);
        printk(" ");
#endif

        /* 2. the TCP header, excluding options, and assuming a
         * checksum of zero.
         */
        old_checksum = th->check;
        th->check = 0;
        sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
        nbytes += sizeof(struct tcphdr);
#ifdef CONFIG_TCP_MD5SIG_DEBUG
        for (i = 0; i < sizeof(struct tcphdr); i++)
                printk(" %02x", (unsigned int)((unsigned char *)th)[i]);
#endif
        /* 3. the TCP segment data (if any) */
        data_len = tcplen - (th->doff << 2);
        if (data_len > 0) {
                unsigned char *data = (unsigned char *)th + (th->doff << 2);
                sg_set_buf(&sg[block++], data, data_len);
                nbytes += data_len;
        }

        /* 4. an independently-specified key or password, known to both
         * TCPs and presumably connection-specific
         */
        sg_set_buf(&sg[block++], key->key, key->keylen);
        nbytes += key->keylen;

#ifdef CONFIG_TCP_MD5SIG_DEBUG
        printk("  and password: ");
        for (i = 0; i < key->keylen; i++)
                printk("%02x ", (unsigned int)key->key[i]);
#endif

        /* Now store the Hash into the packet */
        err = crypto_hash_init(desc);
        if (err)
                goto clear_hash;
        err = crypto_hash_update(desc, sg, nbytes);
        if (err)
                goto clear_hash;
        err = crypto_hash_final(desc, md5_hash);
        if (err)
                goto clear_hash;

        /* Reset header, and free up the crypto */
        tcp_put_md5sig_pool();
        th->check = old_checksum;

out:
#ifdef CONFIG_TCP_MD5SIG_DEBUG
        printk(" result:");
        for (i = 0; i < 16; i++)
                printk(" %02x", (unsigned int)(((u8 *)md5_hash)[i]));
        printk("\n");
#endif
        return 0;
clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        goto out;
}

int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                         struct sock *sk,
                         struct dst_entry *dst,
                         struct request_sock *req,
                         struct tcphdr *th, int protocol,
                         int tcplen)
{
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->saddr;
                daddr = inet_sk(sk)->daddr;
        } else {
                struct rtable *rt = (struct rtable *)dst;
                BUG_ON(!rt);
                saddr = rt->rt_src;
                daddr = rt->rt_dst;
        }
        return tcp_v4_do_calc_md5_hash(md5_hash, key,
                                       saddr, daddr,
                                       th, protocol, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives,
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        struct iphdr *iph = skb->nh.iph;
        struct tcphdr *th = skb->h.th;
        int length = (th->doff << 2) - sizeof(struct tcphdr);
        int genhash;
        unsigned char *ptr;
        unsigned char newhash[16];

        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);

        /*
         * If the TCP option length is less than the TCP_MD5SIG
         * option length, then we can shortcut
         */
        if (length < TCPOLEN_MD5SIG) {
                if (hash_expected)
                        return 1;
                else
                        return 0;
        }

        /* Okay, we can't shortcut - we have to grub through the options */
        ptr = (unsigned char *)(th + 1);
        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                switch (opcode) {
                case TCPOPT_EOL:
                        goto done_opts;
                case TCPOPT_NOP:
                        length--;
                        continue;
                default:
                        opsize = *ptr++;
                        if (opsize < 2)
                                goto done_opts;
                        if (opsize > length)
                                goto done_opts;

                        if (opcode == TCPOPT_MD5SIG) {
                                hash_location = ptr;
                                goto done_opts;
                        }
                }
                ptr += opsize - 2;
                length -= opsize;
        }
done_opts:
        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return 0;

        if (hash_expected && !hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        if (!hash_expected && hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_do_calc_md5_hash(newhash,
                                          hash_expected,
                                          iph->saddr, iph->daddr,
                                          th, sk->sk_protocol,
                                          skb->len);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                if (net_ratelimit()) {
                        printk(KERN_INFO "MD5 Hash failed for "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest),
                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
#ifdef CONFIG_TCP_MD5SIG_DEBUG
                        do {
                                int i;
                                printk("Received: ");
                                for (i = 0; i < 16; i++)
                                        printk("%02x ",
                                               0xff & (int)hash_location[i]);
                                printk("\n");
                                printk("Calculated: ");
                                for (i = 0; i < 16; i++)
                                        printk("%02x ", 0xff & (int)newhash[i]);
                                printk("\n");
                        } while (0);
#endif
                }
                return 1;
        }
        return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_send_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
};

struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
#ifdef CONFIG_TCP_MD5SIG
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
#endif
};

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
        .twsk_unique    = tcp_twsk_unique,
        .twsk_destructor= tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __be32 saddr = skb->nh.iph->saddr;
        __be32 daddr = skb->nh.iph->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast addresses. */
        if (((struct rtable *)skb->dst)->rt_flags &
            (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitation; they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie) {
                tcp_clear_options(&tmp_opt);
                tmp_opt.saw_tstamp = 0;
        }

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on a web server,
                 * which contains information interesting only for Windows
                 * users) do not send their stamp in the SYN. It is an easy
                 * case: we simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, skb->h.th);

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies, the last quarter of the
                         * backlog is filled with destinations proven
                         * to be alive. It means that we continue to
                         * communicate with destinations already
                         * remembered at the moment of the synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
                                       "request from %u.%u.%u.%u/%u\n",
                                       NIPQUAD(saddr),
                                       ntohs(skb->h.th->source));
                        dst_release(dst);
                        goto drop_and_free;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, req, dst))
                goto drop_and_free;

        if (want_cookie) {
                reqsk_free(req);
        } else {
                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        }
        return 0;

drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}

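/*
 * Summary of the admission policy above: a full SYN queue triggers
 * syncookies when sysctl_tcp_syncookies is set and drops otherwise;
 * a full accept backlog drops the SYN once more than one "young"
 * (never-retransmitted) request is already queued; and with
 * tw_recycle, a SYN whose timestamp regressed against the peer table
 * is rejected as a PAWS violation.
 */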
1441
1442 /*
1443  * The three-way handshake has completed - we got a valid ACK -
1444  * now create the new socket.
1445  */
1446 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1447                                   struct request_sock *req,
1448                                   struct dst_entry *dst)
1449 {
1450         struct inet_request_sock *ireq;
1451         struct inet_sock *newinet;
1452         struct tcp_sock *newtp;
1453         struct sock *newsk;
1454 #ifdef CONFIG_TCP_MD5SIG
1455         struct tcp_md5sig_key *key;
1456 #endif
1457
1458         if (sk_acceptq_is_full(sk))
1459                 goto exit_overflow;
1460
1461         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1462                 goto exit;
1463
1464         newsk = tcp_create_openreq_child(sk, req, skb);
1465         if (!newsk)
1466                 goto exit;
1467
1468         newsk->sk_gso_type = SKB_GSO_TCPV4;
1469         sk_setup_caps(newsk, dst);
1470
1471         newtp                 = tcp_sk(newsk);
1472         newinet               = inet_sk(newsk);
1473         ireq                  = inet_rsk(req);
1474         newinet->daddr        = ireq->rmt_addr;
1475         newinet->rcv_saddr    = ireq->loc_addr;
1476         newinet->saddr        = ireq->loc_addr;
1477         newinet->opt          = ireq->opt;
1478         ireq->opt             = NULL;
1479         newinet->mc_index     = inet_iif(skb);
1480         newinet->mc_ttl       = skb->nh.iph->ttl;
1481         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1482         if (newinet->opt)
1483                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1484         newinet->id = newtp->write_seq ^ jiffies;
1485
1486         tcp_mtup_init(newsk);
1487         tcp_sync_mss(newsk, dst_mtu(dst));
1488         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1489         tcp_initialize_rcv_mss(newsk);
1490
1491 #ifdef CONFIG_TCP_MD5SIG
1492         /* Copy over the MD5 key from the original socket */
1493         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1494                 /*
1495                  * We're using one, so create a matching key
1496                  * on the newsk structure. If we fail to get
1497                  * memory, then we end up not copying the key
1498                  * across. Shucks.
1499                  */
1500                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1501                 if (newkey != NULL)
1502                         tcp_v4_md5_do_add(newsk, newinet->daddr,
1503                                           newkey, key->keylen);
1504         }
1505 #endif
1506
1507         __inet_hash(&tcp_hashinfo, newsk, 0);
1508         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1509
1510         return newsk;
1511
1512 exit_overflow:
1513         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1514 exit:
1515         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1516         dst_release(dst);
1517         return NULL;
1518 }
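
/* A sketch of why icsk_ext_hdr_len is taken from the saved IP options
 * above: tcp_sync_mss() subtracts it when deriving the segment size,
 * roughly
 *
 *	mss = pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
 *	      - icsk->icsk_ext_hdr_len;
 *
 * so with a 1500 byte PMTU and 8 bytes of IP options the child socket
 * uses 1500 - 20 - 20 - 8 = 1452 byte segments (illustrative only; the
 * real tcp_sync_mss() also applies clamps and MTU-probing state).
 */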
1519
1520 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1521 {
1522         struct tcphdr *th = skb->h.th;
1523         struct iphdr *iph = skb->nh.iph;
1524         struct sock *nsk;
1525         struct request_sock **prev;
1526         /* Find possible connection requests. */
1527         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1528                                                        iph->saddr, iph->daddr);
1529         if (req)
1530                 return tcp_check_req(sk, skb, req, prev);
1531
1532         nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr,
1533                                       th->source, iph->daddr,
1534                                       th->dest, inet_iif(skb));
1535
1536         if (nsk) {
1537                 if (nsk->sk_state != TCP_TIME_WAIT) {
1538                         bh_lock_sock(nsk);
1539                         return nsk;
1540                 }
1541                 inet_twsk_put(inet_twsk(nsk));
1542                 return NULL;
1543         }
1544
1545 #ifdef CONFIG_SYN_COOKIES
1546         if (!th->rst && !th->syn && th->ack)
1547                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1548 #endif
1549         return sk;
1550 }
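
/* Demux order implemented above, for reference: (1) a pending
 * request_sock in the listener's SYN table (an ACK completing a
 * handshake), (2) an established socket on the same 4-tuple, and
 * (3) optionally a syncookie check for a bare ACK; otherwise the
 * listener itself is returned so a fresh SYN can create a request.
 */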
1551
1552 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1553 {
1554         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1555                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1556                                   skb->nh.iph->daddr, skb->csum)) {
1557                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1558                         return 0;
1559                 }
1560         }
1561
1562         skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1563                                        skb->len, IPPROTO_TCP, 0);
1564
1565         if (skb->len <= 76) {
1566                 return __skb_checksum_complete(skb);
1567         }
1568         return 0;
1569 }
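
/* A minimal sketch of the verification above, assuming a
 * CHECKSUM_COMPLETE skb whose ->csum already covers the TCP segment:
 * tcp_v4_check() adds the IPv4 pseudo-header and folds, i.e.
 *
 *	sum = csum_tcpudp_nofold(saddr, daddr, skb->len, IPPROTO_TCP,
 *				 skb->csum);
 *	valid = (csum_fold(sum) == 0);
 *
 * If the fold is non-zero we fall back to software: the pseudo-header
 * sum is seeded into skb->csum, packets of at most 76 bytes are
 * checksummed immediately (too short to be worth deferring), and
 * longer ones are completed later, e.g. while copying to user space.
 */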
1570
1571
1572 /* The socket must have its spinlock held when we get
1573  * here.
1574  *
1575  * We have a potential double-lock case here, so even when
1576  * doing backlog processing we use the BH locking scheme.
1577  * This is because we cannot sleep with the original spinlock
1578  * held.
1579  */
1580 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1581 {
1582         struct sock *rsk;
1583 #ifdef CONFIG_TCP_MD5SIG
1584         /*
1585          * We really want to reject the packet as early as possible
1586          * if:
1587          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1588          *  o There is an MD5 option and we're not expecting one
1589          */
1590         if (tcp_v4_inbound_md5_hash(sk, skb))
1591                 goto discard;
1592 #endif
1593
1594         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1595                 TCP_CHECK_TIMER(sk);
1596                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) {
1597                         rsk = sk;
1598                         goto reset;
1599                 }
1600                 TCP_CHECK_TIMER(sk);
1601                 return 0;
1602         }
1603
1604         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1605                 goto csum_err;
1606
1607         if (sk->sk_state == TCP_LISTEN) {
1608                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1609                 if (!nsk)
1610                         goto discard;
1611
1612                 if (nsk != sk) {
1613                         if (tcp_child_process(sk, nsk, skb)) {
1614                                 rsk = nsk;
1615                                 goto reset;
1616                         }
1617                         return 0;
1618                 }
1619         }
1620
1621         TCP_CHECK_TIMER(sk);
1622         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) {
1623                 rsk = sk;
1624                 goto reset;
1625         }
1626         TCP_CHECK_TIMER(sk);
1627         return 0;
1628
1629 reset:
1630         tcp_v4_send_reset(rsk, skb);
1631 discard:
1632         kfree_skb(skb);
1633         /* Be careful here. If this function gets more complicated and
1634          * gcc suffers from register pressure on the x86, sk (in %ebx)
1635          * might be destroyed here. This current version compiles correctly,
1636          * but you have been warned.
1637          */
1638         return 0;
1639
1640 csum_err:
1641         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1642         goto discard;
1643 }
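
/* Illustrative sketch of how the backlog reaches this function: skbs
 * queued via sk_add_backlog() in tcp_v4_rcv() are replayed under the
 * BH locking scheme when the lock owner releases the socket, roughly
 *
 *	release_sock(sk)
 *	    -> __release_sock(sk)
 *	           for each queued skb:
 *	               sk->sk_backlog_rcv(sk, skb);  == tcp_v4_do_rcv()
 *
 * which is why this function must never sleep.
 */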
1644
1645 /*
1646  *      From tcp_input.c
1647  */
1648
1649 int tcp_v4_rcv(struct sk_buff *skb)
1650 {
1651         struct tcphdr *th;
1652         struct sock *sk;
1653         int ret;
1654
1655         if (skb->pkt_type != PACKET_HOST)
1656                 goto discard_it;
1657
1658         /* Count it even if it's bad */
1659         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1660
1661         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1662                 goto discard_it;
1663
1664         th = skb->h.th;
1665
1666         if (th->doff < sizeof(struct tcphdr) / 4)
1667                 goto bad_packet;
1668         if (!pskb_may_pull(skb, th->doff * 4))
1669                 goto discard_it;
1670
1671         /* An explanation is required here, I think.
1672          * Packet length and doff are validated by header prediction,
1673          * provided the case of th->doff == 0 has been eliminated.
1674          * So, we defer the checks. */
1675         if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
1676             tcp_v4_checksum_init(skb))
1677                 goto bad_packet;
1678
1679         th = skb->h.th;
1680         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1681         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1682                                     skb->len - th->doff * 4);
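        /* Worked example for end_seq (illustrative): a segment with
         * seq = 1000, a 20 byte header (doff = 5), 21 payload bytes
         * and FIN set gives
         *      end_seq = 1000 + 0 (syn) + 1 (fin) + 41 - 20 = 1022,
         * i.e. the payload plus one unit of sequence space for FIN.
         */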
1683         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1684         TCP_SKB_CB(skb)->when    = 0;
1685         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1686         TCP_SKB_CB(skb)->sacked  = 0;
1687
1688         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1689                            skb->nh.iph->daddr, th->dest,
1690                            inet_iif(skb));
1691
1692         if (!sk)
1693                 goto no_tcp_socket;
1694
1695 process:
1696         if (sk->sk_state == TCP_TIME_WAIT)
1697                 goto do_time_wait;
1698
1699         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1700                 goto discard_and_relse;
1701         nf_reset(skb);
1702
1703         if (sk_filter(sk, skb))
1704                 goto discard_and_relse;
1705
1706         skb->dev = NULL;
1707
1708         bh_lock_sock_nested(sk);
1709         ret = 0;
1710         if (!sock_owned_by_user(sk)) {
1711 #ifdef CONFIG_NET_DMA
1712                 struct tcp_sock *tp = tcp_sk(sk);
1713                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1714                         tp->ucopy.dma_chan = get_softnet_dma();
1715                 if (tp->ucopy.dma_chan)
1716                         ret = tcp_v4_do_rcv(sk, skb);
1717                 else
1718 #endif
1719                 {
1720                         if (!tcp_prequeue(sk, skb))
1721                                 ret = tcp_v4_do_rcv(sk, skb);
1722                 }
1723         } else
1724                 sk_add_backlog(sk, skb);
1725         bh_unlock_sock(sk);
1726
1727         sock_put(sk);
1728
1729         return ret;
1730
1731 no_tcp_socket:
1732         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1733                 goto discard_it;
1734
1735         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1736 bad_packet:
1737                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1738         } else {
1739                 tcp_v4_send_reset(NULL, skb);
1740         }
1741
1742 discard_it:
1743         /* Discard frame. */
1744         kfree_skb(skb);
1745         return 0;
1746
1747 discard_and_relse:
1748         sock_put(sk);
1749         goto discard_it;
1750
1751 do_time_wait:
1752         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1753                 inet_twsk_put(inet_twsk(sk));
1754                 goto discard_it;
1755         }
1756
1757         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1758                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1759                 inet_twsk_put(inet_twsk(sk));
1760                 goto discard_it;
1761         }
1762         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1763         case TCP_TW_SYN: {
1764                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1765                                                         skb->nh.iph->daddr,
1766                                                         th->dest,
1767                                                         inet_iif(skb));
1768                 if (sk2) {
1769                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1770                         inet_twsk_put(inet_twsk(sk));
1771                         sk = sk2;
1772                         goto process;
1773                 }
1774                 /* Fall through to ACK */
1775         }
1776         case TCP_TW_ACK:
1777                 tcp_v4_timewait_ack(sk, skb);
1778                 break;
1779         case TCP_TW_RST:
1780                 goto no_tcp_socket;
1781         case TCP_TW_SUCCESS:;
1782         }
1783         goto discard_it;
1784 }
1785
1786 /* VJ's idea. Save the last timestamp seen from this destination
1787  * and hold it for at least the normal timewait interval, to use for
1788  * duplicate segment detection in subsequent connections, before they
1789  * enter the synchronized state.
1790  */
1791
1792 int tcp_v4_remember_stamp(struct sock *sk)
1793 {
1794         struct inet_sock *inet = inet_sk(sk);
1795         struct tcp_sock *tp = tcp_sk(sk);
1796         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1797         struct inet_peer *peer = NULL;
1798         int release_it = 0;
1799
1800         if (!rt || rt->rt_dst != inet->daddr) {
1801                 peer = inet_getpeer(inet->daddr, 1);
1802                 release_it = 1;
1803         } else {
1804                 if (!rt->peer)
1805                         rt_bind_peer(rt, 1);
1806                 peer = rt->peer;
1807         }
1808
1809         if (peer) {
1810                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1811                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1812                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1813                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1814                         peer->tcp_ts = tp->rx_opt.ts_recent;
1815                 }
1816                 if (release_it)
1817                         inet_putpeer(peer);
1818                 return 1;
1819         }
1820
1821         return 0;
1822 }
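
/* The (s32) casts above give wrap-safe timestamp comparison; a sketch
 * of the arithmetic, since TCP timestamps are 32 bits and may wrap:
 *
 *	(s32)(0x00000005 - 0xfffffffb) == 10 > 0    5 is newer, wrapped
 *	(s32)(peer->tcp_ts - ts_recent) <= 0        cached copy is stale
 *
 * so the cached stamp is only overwritten when our ts_recent is at
 * least as fresh (or the cached entry has aged past TCP_PAWS_MSL).
 */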
1823
1824 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1825 {
1826         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1827
1828         if (peer) {
1829                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1830
1831                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1832                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1833                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1834                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1835                         peer->tcp_ts       = tcptw->tw_ts_recent;
1836                 }
1837                 inet_putpeer(peer);
1838                 return 1;
1839         }
1840
1841         return 0;
1842 }
1843
1844 struct inet_connection_sock_af_ops ipv4_specific = {
1845         .queue_xmit        = ip_queue_xmit,
1846         .send_check        = tcp_v4_send_check,
1847         .rebuild_header    = inet_sk_rebuild_header,
1848         .conn_request      = tcp_v4_conn_request,
1849         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1850         .remember_stamp    = tcp_v4_remember_stamp,
1851         .net_header_len    = sizeof(struct iphdr),
1852         .setsockopt        = ip_setsockopt,
1853         .getsockopt        = ip_getsockopt,
1854         .addr2sockaddr     = inet_csk_addr2sockaddr,
1855         .sockaddr_len      = sizeof(struct sockaddr_in),
1856 #ifdef CONFIG_COMPAT
1857         .compat_setsockopt = compat_ip_setsockopt,
1858         .compat_getsockopt = compat_ip_getsockopt,
1859 #endif
1860 };
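
/* A sketch of how this ops table is consumed (not IPv4-specific): the
 * connection code calls through icsk_af_ops so one TCP engine drives
 * both address families, e.g.
 *
 *	struct inet_connection_sock *icsk = inet_csk(sk);
 *
 *	icsk->icsk_af_ops->queue_xmit(skb, 0);	== ip_queue_xmit here
 *	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
 *
 * tcp_ipv6.c installs its own table with the same slots.
 */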
1861
1862 struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1863 #ifdef CONFIG_TCP_MD5SIG
1864         .md5_lookup             = tcp_v4_md5_lookup,
1865         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1866         .md5_add                = tcp_v4_md5_add_func,
1867         .md5_parse              = tcp_v4_parse_md5_keys,
1868 #endif
1869 };
1870
1871 /* NOTE: A lot of things are set to zero explicitly by the call to
1872  *       sk_alloc(), so they need not be done here.
1873  */
1874 static int tcp_v4_init_sock(struct sock *sk)
1875 {
1876         struct inet_connection_sock *icsk = inet_csk(sk);
1877         struct tcp_sock *tp = tcp_sk(sk);
1878
1879         skb_queue_head_init(&tp->out_of_order_queue);
1880         tcp_init_xmit_timers(sk);
1881         tcp_prequeue_init(tp);
1882
1883         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1884         tp->mdev = TCP_TIMEOUT_INIT;
1885
1886         /* So many TCP implementations out there (incorrectly) count the
1887          * initial SYN frame in their delayed-ACK and congestion control
1888          * algorithms that we must have the following bandaid to talk
1889          * efficiently to them.  -DaveM
1890          */
1891         tp->snd_cwnd = 2;
1892
1893         /* See draft-stevens-tcpca-spec-01 for discussion of the
1894          * initialization of these values.
1895          */
1896         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1897         tp->snd_cwnd_clamp = ~0;
1898         tp->mss_cache = 536;
1899
1900         tp->reordering = sysctl_tcp_reordering;
1901         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1902
1903         sk->sk_state = TCP_CLOSE;
1904
1905         sk->sk_write_space = sk_stream_write_space;
1906         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1907
1908         icsk->icsk_af_ops = &ipv4_specific;
1909         icsk->icsk_sync_mss = tcp_sync_mss;
1910 #ifdef CONFIG_TCP_MD5SIG
1911         tp->af_specific = &tcp_sock_ipv4_specific;
1912 #endif
1913
1914         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1915         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1916
1917         atomic_inc(&tcp_sockets_allocated);
1918
1919         return 0;
1920 }
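
/* Illustrative arithmetic for the defaults above: until the first RTT
 * measurement the socket can have at most
 *
 *	snd_cwnd * mss_cache  ==  2 * 536  ==  1072 bytes
 *
 * in flight; the conservative 536 byte mss_cache is replaced once the
 * route's MTU is known (via tcp_sync_mss()).
 */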
1921
1922 int tcp_v4_destroy_sock(struct sock *sk)
1923 {
1924         struct tcp_sock *tp = tcp_sk(sk);
1925
1926         tcp_clear_xmit_timers(sk);
1927
1928         tcp_cleanup_congestion_control(sk);
1929
1930         /* Clean up the write buffer. */
1931         sk_stream_writequeue_purge(sk);
1932
1933         /* Cleans up our, hopefully empty, out_of_order_queue. */
1934         __skb_queue_purge(&tp->out_of_order_queue);
1935
1936 #ifdef CONFIG_TCP_MD5SIG
1937         /* Clean up the MD5 key list, if any */
1938         if (tp->md5sig_info) {
1939                 tcp_v4_clear_md5_list(sk);
1940                 kfree(tp->md5sig_info);
1941                 tp->md5sig_info = NULL;
1942         }
1943 #endif
1944
1945 #ifdef CONFIG_NET_DMA
1946         /* Cleans up our sk_async_wait_queue */
1947         __skb_queue_purge(&sk->sk_async_wait_queue);
1948 #endif
1949
1950         /* Clean up the prequeue; it really should be empty. */
1951         __skb_queue_purge(&tp->ucopy.prequeue);
1952
1953         /* Clean up a referenced TCP bind bucket. */
1954         if (inet_csk(sk)->icsk_bind_hash)
1955                 inet_put_port(&tcp_hashinfo, sk);
1956
1957         /*
1958          * If sendmsg cached page exists, toss it.
1959          */
1960         if (sk->sk_sndmsg_page) {
1961                 __free_page(sk->sk_sndmsg_page);
1962                 sk->sk_sndmsg_page = NULL;
1963         }
1964
1965         atomic_dec(&tcp_sockets_allocated);
1966
1967         return 0;
1968 }
1969
1970 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1971
1972 #ifdef CONFIG_PROC_FS
1973 /* Proc filesystem TCP sock list dumping. */
1974
1975 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1976 {
1977         return hlist_empty(head) ? NULL :
1978                 hlist_entry(head->first, struct inet_timewait_sock, tw_node);
1979 }
1980
1981 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1982 {
1983         return tw->tw_node.next ?
1984                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1985 }
1986
1987 static void *listening_get_next(struct seq_file *seq, void *cur)
1988 {
1989         struct inet_connection_sock *icsk;
1990         struct hlist_node *node;
1991         struct sock *sk = cur;
1992         struct tcp_iter_state *st = seq->private;
1993
1994         if (!sk) {
1995                 st->bucket = 0;
1996                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1997                 goto get_sk;
1998         }
1999
2000         ++st->num;
2001
2002         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2003                 struct request_sock *req = cur;
2004
2005                 icsk = inet_csk(st->syn_wait_sk);
2006                 req = req->dl_next;
2007                 while (1) {
2008                         while (req) {
2009                                 if (req->rsk_ops->family == st->family) {
2010                                         cur = req;
2011                                         goto out;
2012                                 }
2013                                 req = req->dl_next;
2014                         }
2015                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2016                                 break;
2017 get_req:
2018                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2019                 }
2020                 sk        = sk_next(st->syn_wait_sk);
2021                 st->state = TCP_SEQ_STATE_LISTENING;
2022                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2023         } else {
2024                 icsk = inet_csk(sk);
2025                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2026                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2027                         goto start_req;
2028                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2029                 sk = sk_next(sk);
2030         }
2031 get_sk:
2032         sk_for_each_from(sk, node) {
2033                 if (sk->sk_family == st->family) {
2034                         cur = sk;
2035                         goto out;
2036                 }
2037                 icsk = inet_csk(sk);
2038                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2039                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2040 start_req:
2041                         st->uid         = sock_i_uid(sk);
2042                         st->syn_wait_sk = sk;
2043                         st->state       = TCP_SEQ_STATE_OPENREQ;
2044                         st->sbucket     = 0;
2045                         goto get_req;
2046                 }
2047                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2048         }
2049         if (++st->bucket < INET_LHTABLE_SIZE) {
2050                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2051                 goto get_sk;
2052         }
2053         cur = NULL;
2054 out:
2055         return cur;
2056 }
2057
2058 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2059 {
2060         void *rc = listening_get_next(seq, NULL);
2061
2062         while (rc && *pos) {
2063                 rc = listening_get_next(seq, rc);
2064                 --*pos;
2065         }
2066         return rc;
2067 }
2068
2069 static void *established_get_first(struct seq_file *seq)
2070 {
2071         struct tcp_iter_state *st = seq->private;
2072         void *rc = NULL;
2073
2074         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2075                 struct sock *sk;
2076                 struct hlist_node *node;
2077                 struct inet_timewait_sock *tw;
2078
2079                 /* We can reschedule _before_ having picked the target: */
2080                 cond_resched_softirq();
2081
2082                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2083                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2084                         if (sk->sk_family != st->family) {
2085                                 continue;
2086                         }
2087                         rc = sk;
2088                         goto out;
2089                 }
2090                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2091                 inet_twsk_for_each(tw, node,
2092                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
2093                         if (tw->tw_family != st->family) {
2094                                 continue;
2095                         }
2096                         rc = tw;
2097                         goto out;
2098                 }
2099                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2100                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2101         }
2102 out:
2103         return rc;
2104 }
2105
2106 static void *established_get_next(struct seq_file *seq, void *cur)
2107 {
2108         struct sock *sk = cur;
2109         struct inet_timewait_sock *tw;
2110         struct hlist_node *node;
2111         struct tcp_iter_state *st = seq->private;
2112
2113         ++st->num;
2114
2115         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2116                 tw = cur;
2117                 tw = tw_next(tw);
2118 get_tw:
2119                 while (tw && tw->tw_family != st->family) {
2120                         tw = tw_next(tw);
2121                 }
2122                 if (tw) {
2123                         cur = tw;
2124                         goto out;
2125                 }
2126                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2127                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2128
2129                 /* We can reschedule between buckets: */
2130                 cond_resched_softirq();
2131
2132                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2133                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2134                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2135                 } else {
2136                         cur = NULL;
2137                         goto out;
2138                 }
2139         } else
2140                 sk = sk_next(sk);
2141
2142         sk_for_each_from(sk, node) {
2143                 if (sk->sk_family == st->family)
2144                         goto found;
2145         }
2146
2147         st->state = TCP_SEQ_STATE_TIME_WAIT;
2148         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2149         goto get_tw;
2150 found:
2151         cur = sk;
2152 out:
2153         return cur;
2154 }
2155
2156 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2157 {
2158         void *rc = established_get_first(seq);
2159
2160         while (rc && pos) {
2161                 rc = established_get_next(seq, rc);
2162                 --pos;
2163         }
2164         return rc;
2165 }
2166
2167 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2168 {
2169         void *rc;
2170         struct tcp_iter_state *st = seq->private;
2171
2172         inet_listen_lock(&tcp_hashinfo);
2173         st->state = TCP_SEQ_STATE_LISTENING;
2174         rc        = listening_get_idx(seq, &pos);
2175
2176         if (!rc) {
2177                 inet_listen_unlock(&tcp_hashinfo);
2178                 local_bh_disable();
2179                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2180                 rc        = established_get_idx(seq, pos);
2181         }
2182
2183         return rc;
2184 }
2185
2186 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2187 {
2188         struct tcp_iter_state *st = seq->private;
2189         st->state = TCP_SEQ_STATE_LISTENING;
2190         st->num = 0;
2191         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2192 }
2193
2194 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2195 {
2196         void *rc = NULL;
2197         struct tcp_iter_state *st;
2198
2199         if (v == SEQ_START_TOKEN) {
2200                 rc = tcp_get_idx(seq, 0);
2201                 goto out;
2202         }
2203         st = seq->private;
2204
2205         switch (st->state) {
2206         case TCP_SEQ_STATE_OPENREQ:
2207         case TCP_SEQ_STATE_LISTENING:
2208                 rc = listening_get_next(seq, v);
2209                 if (!rc) {
2210                         inet_listen_unlock(&tcp_hashinfo);
2211                         local_bh_disable();
2212                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2213                         rc        = established_get_first(seq);
2214                 }
2215                 break;
2216         case TCP_SEQ_STATE_ESTABLISHED:
2217         case TCP_SEQ_STATE_TIME_WAIT:
2218                 rc = established_get_next(seq, v);
2219                 break;
2220         }
2221 out:
2222         ++*pos;
2223         return rc;
2224 }
2225
2226 static void tcp_seq_stop(struct seq_file *seq, void *v)
2227 {
2228         struct tcp_iter_state *st = seq->private;
2229
2230         switch (st->state) {
2231         case TCP_SEQ_STATE_OPENREQ:
2232                 if (v) {
2233                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2234                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2235                 }
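                /* fall through: the OPENREQ walk also holds the listen lock */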
2236         case TCP_SEQ_STATE_LISTENING:
2237                 if (v != SEQ_START_TOKEN)
2238                         inet_listen_unlock(&tcp_hashinfo);
2239                 break;
2240         case TCP_SEQ_STATE_TIME_WAIT:
2241         case TCP_SEQ_STATE_ESTABLISHED:
2242                 if (v)
2243                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2244                 local_bh_enable();
2245                 break;
2246         }
2247 }
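
/* Lock lifetimes across the iterator, summarizing the code above (no
 * additional locking is introduced here):
 *
 *	listening walk:    inet_listen_lock(&tcp_hashinfo)
 *	  openreq sub-walk:  plus the listener's syn_wait_lock
 *	established walk:  local_bh_disable() plus the ehash bucket lock
 *
 * tcp_seq_stop() drops whichever of these the current state holds, so
 * an aborted read of /proc/net/tcp never leaks a lock.
 */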
2248
2249 static int tcp_seq_open(struct inode *inode, struct file *file)
2250 {
2251         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2252         struct seq_file *seq;
2253         struct tcp_iter_state *s;
2254         int rc;
2255
2256         if (unlikely(afinfo == NULL))
2257                 return -EINVAL;
2258
2259         s = kzalloc(sizeof(*s), GFP_KERNEL);
2260         if (!s)
2261                 return -ENOMEM;
2262         s->family               = afinfo->family;
2263         s->seq_ops.start        = tcp_seq_start;
2264         s->seq_ops.next         = tcp_seq_next;
2265         s->seq_ops.show         = afinfo->seq_show;
2266         s->seq_ops.stop         = tcp_seq_stop;
2267
2268         rc = seq_open(file, &s->seq_ops);
2269         if (rc)
2270                 goto out_kfree;
2271         seq          = file->private_data;
2272         seq->private = s;
2273 out:
2274         return rc;
2275 out_kfree:
2276         kfree(s);
2277         goto out;
2278 }
2279
2280 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2281 {
2282         int rc = 0;
2283         struct proc_dir_entry *p;
2284
2285         if (!afinfo)
2286                 return -EINVAL;
2287         afinfo->seq_fops->owner         = afinfo->owner;
2288         afinfo->seq_fops->open          = tcp_seq_open;
2289         afinfo->seq_fops->read          = seq_read;
2290         afinfo->seq_fops->llseek        = seq_lseek;
2291         afinfo->seq_fops->release       = seq_release_private;
2292
2293         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2294         if (p)
2295                 p->data = afinfo;
2296         else
2297                 rc = -ENOMEM;
2298         return rc;
2299 }
2300
2301 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2302 {
2303         if (!afinfo)
2304                 return;
2305         proc_net_remove(afinfo->name);
2306         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2307 }
2308
2309 static void get_openreq4(struct sock *sk, struct request_sock *req,
2310                          char *tmpbuf, int i, int uid)
2311 {
2312         const struct inet_request_sock *ireq = inet_rsk(req);
2313         int ttd = req->expires - jiffies;
2314
2315         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2316                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2317                 i,
2318                 ireq->loc_addr,
2319                 ntohs(inet_sk(sk)->sport),
2320                 ireq->rmt_addr,
2321                 ntohs(ireq->rmt_port),
2322                 TCP_SYN_RECV,
2323                 0, 0, /* could print option size, but that is af dependent. */
2324                 1,    /* timers active (only the expire timer) */
2325                 jiffies_to_clock_t(ttd),
2326                 req->retrans,
2327                 uid,
2328                 0,  /* non standard timer */
2329                 0, /* open_requests have no inode */
2330                 atomic_read(&sk->sk_refcnt),
2331                 req);
2332 }
2333
2334 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2335 {
2336         int timer_active;
2337         unsigned long timer_expires;
2338         struct tcp_sock *tp = tcp_sk(sp);
2339         const struct inet_connection_sock *icsk = inet_csk(sp);
2340         struct inet_sock *inet = inet_sk(sp);
2341         __be32 dest = inet->daddr;
2342         __be32 src = inet->rcv_saddr;
2343         __u16 destp = ntohs(inet->dport);
2344         __u16 srcp = ntohs(inet->sport);
2345
2346         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2347                 timer_active    = 1;
2348                 timer_expires   = icsk->icsk_timeout;
2349         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2350                 timer_active    = 4;
2351                 timer_expires   = icsk->icsk_timeout;
2352         } else if (timer_pending(&sp->sk_timer)) {
2353                 timer_active    = 2;
2354                 timer_expires   = sp->sk_timer.expires;
2355         } else {
2356                 timer_active    = 0;
2357                 timer_expires   = jiffies;
2358         }
2359
2360         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2361                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2362                 i, src, srcp, dest, destp, sp->sk_state,
2363                 tp->write_seq - tp->snd_una,
2364                 sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog :
2365                                              (tp->rcv_nxt - tp->copied_seq),
2366                 timer_active,
2367                 jiffies_to_clock_t(timer_expires - jiffies),
2368                 icsk->icsk_retransmits,
2369                 sock_i_uid(sp),
2370                 icsk->icsk_probes_out,
2371                 sock_i_ino(sp),
2372                 atomic_read(&sp->sk_refcnt), sp,
2373                 icsk->icsk_rto,
2374                 icsk->icsk_ack.ato,
2375                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2376                 tp->snd_cwnd,
2377                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2378 }
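
/* Decoding sketch for the hex fields emitted above: addresses are
 * printed as raw __be32 values with %08X, so on a little-endian host
 * 127.0.0.1:80 shows up as
 *
 *	0100007F:0050
 *
 * (the network-order address bytes appear reversed through the
 * host-order printf, while ports were already converted with ntohs).
 * Tools such as netstat undo this when parsing /proc/net/tcp.
 */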
2379
2380 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2381                                char *tmpbuf, int i)
2382 {
2383         __be32 dest, src;
2384         __u16 destp, srcp;
2385         int ttd = tw->tw_ttd - jiffies;
2386
2387         if (ttd < 0)
2388                 ttd = 0;
2389
2390         dest  = tw->tw_daddr;
2391         src   = tw->tw_rcv_saddr;
2392         destp = ntohs(tw->tw_dport);
2393         srcp  = ntohs(tw->tw_sport);
2394
2395         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2396                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2397                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2398                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2399                 atomic_read(&tw->tw_refcnt), tw);
2400 }
2401
2402 #define TMPSZ 150
2403
2404 static int tcp4_seq_show(struct seq_file *seq, void *v)
2405 {
2406         struct tcp_iter_state *st;
2407         char tmpbuf[TMPSZ + 1];
2408
2409         if (v == SEQ_START_TOKEN) {
2410                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2411                            "  sl  local_address rem_address   st tx_queue "
2412                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2413                            "inode");
2414                 goto out;
2415         }
2416         st = seq->private;
2417
2418         switch (st->state) {
2419         case TCP_SEQ_STATE_LISTENING:
2420         case TCP_SEQ_STATE_ESTABLISHED:
2421                 get_tcp4_sock(v, tmpbuf, st->num);
2422                 break;
2423         case TCP_SEQ_STATE_OPENREQ:
2424                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2425                 break;
2426         case TCP_SEQ_STATE_TIME_WAIT:
2427                 get_timewait4_sock(v, tmpbuf, st->num);
2428                 break;
2429         }
2430         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2431 out:
2432         return 0;
2433 }
2434
2435 static struct file_operations tcp4_seq_fops;
2436 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2437         .owner          = THIS_MODULE,
2438         .name           = "tcp",
2439         .family         = AF_INET,
2440         .seq_show       = tcp4_seq_show,
2441         .seq_fops       = &tcp4_seq_fops,
2442 };
2443
2444 int __init tcp4_proc_init(void)
2445 {
2446         return tcp_proc_register(&tcp4_seq_afinfo);
2447 }
2448
2449 void tcp4_proc_exit(void)
2450 {
2451         tcp_proc_unregister(&tcp4_seq_afinfo);
2452 }
2453 #endif /* CONFIG_PROC_FS */
2454
2455 struct proto tcp_prot = {
2456         .name                   = "TCP",
2457         .owner                  = THIS_MODULE,
2458         .close                  = tcp_close,
2459         .connect                = tcp_v4_connect,
2460         .disconnect             = tcp_disconnect,
2461         .accept                 = inet_csk_accept,
2462         .ioctl                  = tcp_ioctl,
2463         .init                   = tcp_v4_init_sock,
2464         .destroy                = tcp_v4_destroy_sock,
2465         .shutdown               = tcp_shutdown,
2466         .setsockopt             = tcp_setsockopt,
2467         .getsockopt             = tcp_getsockopt,
2468         .sendmsg                = tcp_sendmsg,
2469         .recvmsg                = tcp_recvmsg,
2470         .backlog_rcv            = tcp_v4_do_rcv,
2471         .hash                   = tcp_v4_hash,
2472         .unhash                 = tcp_unhash,
2473         .get_port               = tcp_v4_get_port,
2474         .enter_memory_pressure  = tcp_enter_memory_pressure,
2475         .sockets_allocated      = &tcp_sockets_allocated,
2476         .orphan_count           = &tcp_orphan_count,
2477         .memory_allocated       = &tcp_memory_allocated,
2478         .memory_pressure        = &tcp_memory_pressure,
2479         .sysctl_mem             = sysctl_tcp_mem,
2480         .sysctl_wmem            = sysctl_tcp_wmem,
2481         .sysctl_rmem            = sysctl_tcp_rmem,
2482         .max_header             = MAX_TCP_HEADER,
2483         .obj_size               = sizeof(struct tcp_sock),
2484         .twsk_prot              = &tcp_timewait_sock_ops,
2485         .rsk_prot               = &tcp_request_sock_ops,
2486 #ifdef CONFIG_COMPAT
2487         .compat_setsockopt      = compat_tcp_setsockopt,
2488         .compat_getsockopt      = compat_tcp_getsockopt,
2489 #endif
2490 };
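
/* Sketch of the dispatch path that reaches this table: the AF_INET
 * layer stores it in sk->sk_prot at socket creation, after which e.g.
 *
 *	inet_sendmsg()  ->  sk->sk_prot->sendmsg(...)  == tcp_sendmsg()
 *	inet_release()  ->  sk->sk_prot->close(...)    == tcp_close()
 *
 * making this structure TCP's entry-point table for the generic inet
 * socket code.
 */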
2491
2492 void __init tcp_v4_init(struct net_proto_family *ops)
2493 {
2494         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2495                                      IPPROTO_TCP) < 0)
2496                 panic("Failed to create the TCP control socket.\n");
2497 }
2498
2499 EXPORT_SYMBOL(ipv4_specific);
2500 EXPORT_SYMBOL(tcp_hashinfo);
2501 EXPORT_SYMBOL(tcp_prot);
2502 EXPORT_SYMBOL(tcp_unhash);
2503 EXPORT_SYMBOL(tcp_v4_conn_request);
2504 EXPORT_SYMBOL(tcp_v4_connect);
2505 EXPORT_SYMBOL(tcp_v4_do_rcv);
2506 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2507 EXPORT_SYMBOL(tcp_v4_send_check);
2508 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2509
2510 #ifdef CONFIG_PROC_FS
2511 EXPORT_SYMBOL(tcp_proc_register);
2512 EXPORT_SYMBOL(tcp_proc_unregister);
2513 #endif
2514 EXPORT_SYMBOL(sysctl_local_port_range);
2515 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2516