[INET]: Generalise tcp_v4_lookup_listener
net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96         .portalloc_lock = SPIN_LOCK_UNLOCKED,
97         .port_rover     = 1024 - 1,
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106
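/* Walk the sockets already bound to this port and report a conflict unless
 * the two sockets are bound to different devices, both have SO_REUSEADDR set
 * (and the existing one is not listening), or their bound addresses differ
 * and neither is the wildcard.
 */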
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
108 {
109         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
110         struct sock *sk2;
111         struct hlist_node *node;
112         int reuse = sk->sk_reuse;
113
114         sk_for_each_bound(sk2, node, &tb->owners) {
115                 if (sk != sk2 &&
116                     !tcp_v6_ipv6only(sk2) &&
117                     (!sk->sk_bound_dev_if ||
118                      !sk2->sk_bound_dev_if ||
119                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120                         if (!reuse || !sk2->sk_reuse ||
121                             sk2->sk_state == TCP_LISTEN) {
122                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124                                     sk2_rcv_saddr == sk_rcv_saddr)
125                                         break;
126                         }
127                 }
128         }
129         return node != NULL;
130 }
131
132 /* Obtain a reference to a local port for the given sock;
133  * if snum is zero, select any available local port.
134  */
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
136 {
137         struct inet_bind_hashbucket *head;
138         struct hlist_node *node;
139         struct inet_bind_bucket *tb;
140         int ret;
141
142         local_bh_disable();
143         if (!snum) {
144                 int low = sysctl_local_port_range[0];
145                 int high = sysctl_local_port_range[1];
146                 int remaining = (high - low) + 1;
147                 int rover;
148
149                 spin_lock(&tcp_hashinfo.portalloc_lock);
150                 if (tcp_hashinfo.port_rover < low)
151                         rover = low;
152                 else
153                         rover = tcp_hashinfo.port_rover;
154                 do {
155                         rover++;
156                         if (rover > high)
157                                 rover = low;
158                         head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159                         spin_lock(&head->lock);
160                         inet_bind_bucket_for_each(tb, node, &head->chain)
161                                 if (tb->port == rover)
162                                         goto next;
163                         break;
164                 next:
165                         spin_unlock(&head->lock);
166                 } while (--remaining > 0);
167                 tcp_hashinfo.port_rover = rover;
168                 spin_unlock(&tcp_hashinfo.portalloc_lock);
169
170                 /* Exhausted local port range during search?  It is not
171                  * possible for us to be holding one of the bind hash
172                  * locks if this test triggers, because if 'remaining'
173                  * drops to zero, we broke out of the do/while loop at
174                  * the top level, not from the 'break;' statement.
175                  */
176                 ret = 1;
177                 if (unlikely(remaining <= 0))
178                         goto fail;
179
180                 /* OK, here is the one we will use.  HEAD is
181                  * non-NULL and we hold its lock.
182                  */
183                 snum = rover;
184         } else {
185                 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186                 spin_lock(&head->lock);
187                 inet_bind_bucket_for_each(tb, node, &head->chain)
188                         if (tb->port == snum)
189                                 goto tb_found;
190         }
191         tb = NULL;
192         goto tb_not_found;
193 tb_found:
194         if (!hlist_empty(&tb->owners)) {
195                 if (sk->sk_reuse > 1)
196                         goto success;
197                 if (tb->fastreuse > 0 &&
198                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
199                         goto success;
200                 } else {
201                         ret = 1;
202                         if (tcp_bind_conflict(sk, tb))
203                                 goto fail_unlock;
204                 }
205         }
206 tb_not_found:
207         ret = 1;
208         if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
209                 goto fail_unlock;
210         if (hlist_empty(&tb->owners)) {
211                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
212                         tb->fastreuse = 1;
213                 else
214                         tb->fastreuse = 0;
215         } else if (tb->fastreuse &&
216                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217                 tb->fastreuse = 0;
218 success:
219         if (!inet_sk(sk)->bind_hash)
220                 inet_bind_hash(sk, tb, snum);
221         BUG_TRAP(inet_sk(sk)->bind_hash == tb);
222         ret = 0;
223
224 fail_unlock:
225         spin_unlock(&head->lock);
226 fail:
227         local_bh_enable();
228         return ret;
229 }
230
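/* Thin wrappers that add/remove the socket in the shared TCP hash tables. */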
231 static void tcp_v4_hash(struct sock *sk)
232 {
233         inet_hash(&tcp_hashinfo, sk);
234 }
235
236 void tcp_unhash(struct sock *sk)
237 {
238         inet_unhash(&tcp_hashinfo, sk);
239 }
240
241 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
242  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
243  *
244  * Local BH must be disabled here.
245  */
246
247 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
248                                                        const u16 sport,
249                                                        const u32 daddr,
250                                                        const u16 hnum,
251                                                        const int dif)
252 {
253         struct inet_ehash_bucket *head;
254         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
255         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
256         struct sock *sk;
257         struct hlist_node *node;
258         /* Optimize here for a direct hit; only listening sockets can
259          * have wildcards anyway.
260          */
261         const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
262         head = &tcp_hashinfo.ehash[hash];
263         read_lock(&head->lock);
264         sk_for_each(sk, node, &head->chain) {
265                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
266                         goto hit; /* You sunk my battleship! */
267         }
268
269         /* Must check for a TIME_WAIT'er before going to listener hash. */
270         sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
271                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
272                         goto hit;
273         }
274         sk = NULL;
275 out:
276         read_unlock(&head->lock);
277         return sk;
278 hit:
279         sock_hold(sk);
280         goto out;
281 }
282
283 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
284                                            u32 daddr, u16 hnum, int dif)
285 {
286         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
287                                                       daddr, hnum, dif);
288
289         return sk ? : inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif);
290 }
291
292 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
293                                   u16 dport, int dif)
294 {
295         struct sock *sk;
296
297         local_bh_disable();
298         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
299         local_bh_enable();
300
301         return sk;
302 }
303
304 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
305
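/* Derive a secure initial sequence number from the addresses and ports of
 * the received segment.
 */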
306 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
307 {
308         return secure_tcp_sequence_number(skb->nh.iph->daddr,
309                                           skb->nh.iph->saddr,
310                                           skb->h.th->dest,
311                                           skb->h.th->source);
312 }
313
314 /* called with local bh disabled */
315 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
316                                       struct tcp_tw_bucket **twp)
317 {
318         struct inet_sock *inet = inet_sk(sk);
319         u32 daddr = inet->rcv_saddr;
320         u32 saddr = inet->daddr;
321         int dif = sk->sk_bound_dev_if;
322         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
323         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
324         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
325         struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
326         struct sock *sk2;
327         struct hlist_node *node;
328         struct tcp_tw_bucket *tw;
329
330         write_lock(&head->lock);
331
332         /* Check TIME-WAIT sockets first. */
333         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
334                 tw = (struct tcp_tw_bucket *)sk2;
335
336                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
337                         struct tcp_sock *tp = tcp_sk(sk);
338
339                         /* With PAWS, it is safe from the viewpoint
340                            of data integrity. Even without PAWS it
341                            is safe provided sequence spaces do not
342                            overlap, i.e. at data rates <= 80 Mbit/sec.
343
344                            Actually, the idea is close to VJ's: the
345                            timestamp cache is held not per host but per
346                            port pair, and the TW bucket is used as the
347                            state holder.
348
349                            If the TW bucket has already been destroyed we
350                            fall back to VJ's scheme and use the initial
351                            timestamp retrieved from the peer table.
352                          */
353                         if (tw->tw_ts_recent_stamp &&
354                             (!twp || (sysctl_tcp_tw_reuse &&
355                                       xtime.tv_sec -
356                                       tw->tw_ts_recent_stamp > 1))) {
357                                 if ((tp->write_seq =
358                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
359                                         tp->write_seq = 1;
360                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
361                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
362                                 sock_hold(sk2);
363                                 goto unique;
364                         } else
365                                 goto not_unique;
366                 }
367         }
368         tw = NULL;
369
370         /* And established part... */
371         sk_for_each(sk2, node, &head->chain) {
372                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
373                         goto not_unique;
374         }
375
376 unique:
377         /* Must record num and sport now. Otherwise we will see
378          * a socket with a funny identity in the hash table. */
379         inet->num = lport;
380         inet->sport = htons(lport);
381         sk->sk_hashent = hash;
382         BUG_TRAP(sk_unhashed(sk));
383         __sk_add_node(sk, &head->chain);
384         sock_prot_inc_use(sk->sk_prot);
385         write_unlock(&head->lock);
386
387         if (twp) {
388                 *twp = tw;
389                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
390         } else if (tw) {
391                 /* Silly. Should hash-dance instead... */
392                 tcp_tw_deschedule(tw);
393                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
394
395                 tcp_tw_put(tw);
396         }
397
398         return 0;
399
400 not_unique:
401         write_unlock(&head->lock);
402         return -EADDRNOTAVAIL;
403 }
404
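/* Per-destination offset used as the starting point of the ephemeral port
 * search, so that connections to different peers probe different ports.
 */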
405 static inline u32 connect_port_offset(const struct sock *sk)
406 {
407         const struct inet_sock *inet = inet_sk(sk);
408
409         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
410                                          inet->dport);
411 }
412
413 /*
414  * Bind a port for a connect operation and hash it.
415  */
416 static inline int tcp_v4_hash_connect(struct sock *sk)
417 {
418         const unsigned short snum = inet_sk(sk)->num;
419         struct inet_bind_hashbucket *head;
420         struct inet_bind_bucket *tb;
421         int ret;
422
423         if (!snum) {
424                 int low = sysctl_local_port_range[0];
425                 int high = sysctl_local_port_range[1];
426                 int range = high - low;
427                 int i;
428                 int port;
429                 static u32 hint;
430                 u32 offset = hint + connect_port_offset(sk);
431                 struct hlist_node *node;
432                 struct tcp_tw_bucket *tw = NULL;
433
434                 local_bh_disable();
435                 for (i = 1; i <= range; i++) {
436                         port = low + (i + offset) % range;
437                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
438                         spin_lock(&head->lock);
439
440                         /* Does not bother with rcv_saddr checks,
441                          * because the established check is already
442                          * unique enough.
443                          */
444                         inet_bind_bucket_for_each(tb, node, &head->chain) {
445                                 if (tb->port == port) {
446                                         BUG_TRAP(!hlist_empty(&tb->owners));
447                                         if (tb->fastreuse >= 0)
448                                                 goto next_port;
449                                         if (!__tcp_v4_check_established(sk,
450                                                                         port,
451                                                                         &tw))
452                                                 goto ok;
453                                         goto next_port;
454                                 }
455                         }
456
457                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
458                         if (!tb) {
459                                 spin_unlock(&head->lock);
460                                 break;
461                         }
462                         tb->fastreuse = -1;
463                         goto ok;
464
465                 next_port:
466                         spin_unlock(&head->lock);
467                 }
468                 local_bh_enable();
469
470                 return -EADDRNOTAVAIL;
471
472 ok:
473                 hint += i;
474
475                 /* Head lock still held and bh's disabled */
476                 inet_bind_hash(sk, tb, port);
477                 if (sk_unhashed(sk)) {
478                         inet_sk(sk)->sport = htons(port);
479                         __inet_hash(&tcp_hashinfo, sk, 0);
480                 }
481                 spin_unlock(&head->lock);
482
483                 if (tw) {
484                         tcp_tw_deschedule(tw);
485                         tcp_tw_put(tw);
486                 }
487
488                 ret = 0;
489                 goto out;
490         }
491
492         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
493         tb  = inet_sk(sk)->bind_hash;
494         spin_lock_bh(&head->lock);
495         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
496                 __inet_hash(&tcp_hashinfo, sk, 0);
497                 spin_unlock_bh(&head->lock);
498                 return 0;
499         } else {
500                 spin_unlock(&head->lock);
501                 /* No definite answer... Walk to established hash table */
502                 ret = __tcp_v4_check_established(sk, snum, NULL);
503 out:
504                 local_bh_enable();
505                 return ret;
506         }
507 }
508
509 /* This will initiate an outgoing connection. */
510 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
511 {
512         struct inet_sock *inet = inet_sk(sk);
513         struct tcp_sock *tp = tcp_sk(sk);
514         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
515         struct rtable *rt;
516         u32 daddr, nexthop;
517         int tmp;
518         int err;
519
520         if (addr_len < sizeof(struct sockaddr_in))
521                 return -EINVAL;
522
523         if (usin->sin_family != AF_INET)
524                 return -EAFNOSUPPORT;
525
526         nexthop = daddr = usin->sin_addr.s_addr;
527         if (inet->opt && inet->opt->srr) {
528                 if (!daddr)
529                         return -EINVAL;
530                 nexthop = inet->opt->faddr;
531         }
532
533         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
534                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
535                                IPPROTO_TCP,
536                                inet->sport, usin->sin_port, sk);
537         if (tmp < 0)
538                 return tmp;
539
540         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
541                 ip_rt_put(rt);
542                 return -ENETUNREACH;
543         }
544
545         if (!inet->opt || !inet->opt->srr)
546                 daddr = rt->rt_dst;
547
548         if (!inet->saddr)
549                 inet->saddr = rt->rt_src;
550         inet->rcv_saddr = inet->saddr;
551
552         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
553                 /* Reset inherited state */
554                 tp->rx_opt.ts_recent       = 0;
555                 tp->rx_opt.ts_recent_stamp = 0;
556                 tp->write_seq              = 0;
557         }
558
559         if (sysctl_tcp_tw_recycle &&
560             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
561                 struct inet_peer *peer = rt_get_peer(rt);
562
563                 /* VJ's idea. We save the last timestamp seen from
564                  * the destination in the peer table when entering TIME-WAIT,
565                  * and initialize rx_opt.ts_recent from it when trying a new connection.
566                  */
567
568                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
569                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
570                         tp->rx_opt.ts_recent = peer->tcp_ts;
571                 }
572         }
573
574         inet->dport = usin->sin_port;
575         inet->daddr = daddr;
576
577         tp->ext_header_len = 0;
578         if (inet->opt)
579                 tp->ext_header_len = inet->opt->optlen;
580
581         tp->rx_opt.mss_clamp = 536;
582
583         /* Socket identity is still unknown (sport may be zero).
584          * However we set the state to SYN-SENT and, without releasing the
585          * socket lock, select a source port, enter ourselves into the hash
586          * tables and complete initialization after this.
587          */
588         tcp_set_state(sk, TCP_SYN_SENT);
589         err = tcp_v4_hash_connect(sk);
590         if (err)
591                 goto failure;
592
593         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
594         if (err)
595                 goto failure;
596
597         /* OK, now commit destination to socket.  */
598         sk_setup_caps(sk, &rt->u.dst);
599
600         if (!tp->write_seq)
601                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
602                                                            inet->daddr,
603                                                            inet->sport,
604                                                            usin->sin_port);
605
606         inet->id = tp->write_seq ^ jiffies;
607
608         err = tcp_connect(sk);
609         rt = NULL;
610         if (err)
611                 goto failure;
612
613         return 0;
614
615 failure:
616         /* This unhashes the socket and releases the local port, if necessary. */
617         tcp_set_state(sk, TCP_CLOSE);
618         ip_rt_put(rt);
619         sk->sk_route_caps = 0;
620         inet->dport = 0;
621         return err;
622 }
623
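/* Input interface index, taken from the route attached to the skb. */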
624 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
625 {
626         return ((struct rtable *)skb->dst)->rt_iif;
627 }
628
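/* Hash the remote address and port with the listener's random seed into the
 * SYN queue; TCP_SYNQ_HSIZE must be a power of two since we mask with
 * (TCP_SYNQ_HSIZE - 1).
 */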
629 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
630 {
631         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
632 }
633
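/* Walk this listener's SYN queue chain looking for a pending request that
 * matches the remote port/address and local address; *prevp is set so the
 * caller can unlink the entry.
 */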
634 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
635                                               struct request_sock ***prevp,
636                                               __u16 rport,
637                                               __u32 raddr, __u32 laddr)
638 {
639         struct listen_sock *lopt = tp->accept_queue.listen_opt;
640         struct request_sock *req, **prev;
641
642         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
643              (req = *prev) != NULL;
644              prev = &req->dl_next) {
645                 const struct inet_request_sock *ireq = inet_rsk(req);
646
647                 if (ireq->rmt_port == rport &&
648                     ireq->rmt_addr == raddr &&
649                     ireq->loc_addr == laddr &&
650                     TCP_INET_FAMILY(req->rsk_ops->family)) {
651                         BUG_TRAP(!req->sk);
652                         *prevp = prev;
653                         break;
654                 }
655         }
656
657         return req;
658 }
659
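/* Hash the request into the listener's SYN queue with the initial SYN-ACK
 * timeout and account for the new entry.
 */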
660 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
661 {
662         struct tcp_sock *tp = tcp_sk(sk);
663         struct listen_sock *lopt = tp->accept_queue.listen_opt;
664         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
665
666         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
667         tcp_synq_added(sk);
668 }
669
670
671 /*
672  * This routine does path mtu discovery as defined in RFC1191.
673  */
674 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
675                                      u32 mtu)
676 {
677         struct dst_entry *dst;
678         struct inet_sock *inet = inet_sk(sk);
679         struct tcp_sock *tp = tcp_sk(sk);
680
681         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
682          * sent out by Linux are always < 576 bytes, so they should go through
683          * unfragmented).
684          */
685         if (sk->sk_state == TCP_LISTEN)
686                 return;
687
688         /* We don't check in the dst entry whether PMTU discovery is forbidden
689          * on this route. We just assume that no packet-too-big packets
690          * are sent back when PMTU discovery is not active.
691          * There is a small race when the user changes this flag in the
692          * route, but I think that's acceptable.
693          */
694         if ((dst = __sk_dst_check(sk, 0)) == NULL)
695                 return;
696
697         dst->ops->update_pmtu(dst, mtu);
698
699         /* Something is about to go wrong... Remember the soft error
700          * in case this connection is not able to recover.
701          */
702         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
703                 sk->sk_err_soft = EMSGSIZE;
704
705         mtu = dst_mtu(dst);
706
707         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
708             tp->pmtu_cookie > mtu) {
709                 tcp_sync_mss(sk, mtu);
710
711                 /* Resend the TCP packet because it's
712                  * clear that the old packet has been
713                  * dropped. This is the new "fast" path mtu
714                  * discovery.
715                  */
716                 tcp_simple_retransmit(sk);
717         } /* else let the usual retransmit timer handle it */
718 }
719
720 /*
721  * This routine is called by the ICMP module when it gets some
722  * sort of error condition.  If err < 0 then the socket should
723  * be closed and the error returned to the user.  If err > 0
724  * it's just the icmp type << 8 | icmp code.  After adjustment
725  * header points to the first 8 bytes of the tcp header.  We need
726  * to find the appropriate port.
727  *
728  * The locking strategy used here is very "optimistic". When
729  * someone else accesses the socket the ICMP is just dropped
730  * and for some paths there is no check at all.
731  * A more general error queue to queue errors for later handling
732  * is probably better.
733  *
734  */
735
736 void tcp_v4_err(struct sk_buff *skb, u32 info)
737 {
738         struct iphdr *iph = (struct iphdr *)skb->data;
739         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
740         struct tcp_sock *tp;
741         struct inet_sock *inet;
742         int type = skb->h.icmph->type;
743         int code = skb->h.icmph->code;
744         struct sock *sk;
745         __u32 seq;
746         int err;
747
748         if (skb->len < (iph->ihl << 2) + 8) {
749                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
750                 return;
751         }
752
753         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
754                            th->source, tcp_v4_iif(skb));
755         if (!sk) {
756                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
757                 return;
758         }
759         if (sk->sk_state == TCP_TIME_WAIT) {
760                 tcp_tw_put((struct tcp_tw_bucket *)sk);
761                 return;
762         }
763
764         bh_lock_sock(sk);
765         /* If too many ICMPs get dropped on busy
766          * servers this needs to be solved differently.
767          */
768         if (sock_owned_by_user(sk))
769                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
770
771         if (sk->sk_state == TCP_CLOSE)
772                 goto out;
773
774         tp = tcp_sk(sk);
775         seq = ntohl(th->seq);
776         if (sk->sk_state != TCP_LISTEN &&
777             !between(seq, tp->snd_una, tp->snd_nxt)) {
778                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
779                 goto out;
780         }
781
782         switch (type) {
783         case ICMP_SOURCE_QUENCH:
784                 /* Just silently ignore these. */
785                 goto out;
786         case ICMP_PARAMETERPROB:
787                 err = EPROTO;
788                 break;
789         case ICMP_DEST_UNREACH:
790                 if (code > NR_ICMP_UNREACH)
791                         goto out;
792
793                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
794                         if (!sock_owned_by_user(sk))
795                                 do_pmtu_discovery(sk, iph, info);
796                         goto out;
797                 }
798
799                 err = icmp_err_convert[code].errno;
800                 break;
801         case ICMP_TIME_EXCEEDED:
802                 err = EHOSTUNREACH;
803                 break;
804         default:
805                 goto out;
806         }
807
808         switch (sk->sk_state) {
809                 struct request_sock *req, **prev;
810         case TCP_LISTEN:
811                 if (sock_owned_by_user(sk))
812                         goto out;
813
814                 req = tcp_v4_search_req(tp, &prev, th->dest,
815                                         iph->daddr, iph->saddr);
816                 if (!req)
817                         goto out;
818
819                 /* ICMPs are not backlogged, hence we cannot get
820                    an established socket here.
821                  */
822                 BUG_TRAP(!req->sk);
823
824                 if (seq != tcp_rsk(req)->snt_isn) {
825                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
826                         goto out;
827                 }
828
829                 /*
830                  * Still in SYN_RECV, just remove it silently.
831                  * There is no good way to pass the error to the newly
832                  * created socket, and POSIX does not want network
833                  * errors returned from accept().
834                  */
835                 tcp_synq_drop(sk, req, prev);
836                 goto out;
837
838         case TCP_SYN_SENT:
839         case TCP_SYN_RECV:  /* Cannot happen normally.
840                                It can, e.g., if SYNs crossed.
841                              */
842                 if (!sock_owned_by_user(sk)) {
843                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
844                         sk->sk_err = err;
845
846                         sk->sk_error_report(sk);
847
848                         tcp_done(sk);
849                 } else {
850                         sk->sk_err_soft = err;
851                 }
852                 goto out;
853         }
854
855         /* If we've already connected we will keep trying
856          * until we time out, or the user gives up.
857          *
858          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
859          * considered hard errors (well, FRAG_FAILED too,
860          * but it is obsoleted by PMTU discovery).
861          *
862          * Note that in the modern internet, where routing is unreliable
863          * and broken firewalls sit in every dark corner sending random
864          * errors ordered by their masters, even these two messages have lost
865          * their original sense (even Linux sends invalid PORT_UNREACHs).
866          *
867          * Now we are in compliance with RFCs.
868          *                                                      --ANK (980905)
869          */
870
871         inet = inet_sk(sk);
872         if (!sock_owned_by_user(sk) && inet->recverr) {
873                 sk->sk_err = err;
874                 sk->sk_error_report(sk);
875         } else  { /* Only an error on timeout */
876                 sk->sk_err_soft = err;
877         }
878
879 out:
880         bh_unlock_sock(sk);
881         sock_put(sk);
882 }
883
884 /* This routine computes an IPv4 TCP checksum. */
885 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
886                        struct sk_buff *skb)
887 {
888         struct inet_sock *inet = inet_sk(sk);
889
890         if (skb->ip_summed == CHECKSUM_HW) {
891                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
892                 skb->csum = offsetof(struct tcphdr, check);
893         } else {
894                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
895                                          csum_partial((char *)th,
896                                                       th->doff << 2,
897                                                       skb->csum));
898         }
899 }
900
901 /*
902  *      This routine will send an RST to the other tcp.
903  *
904  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
905  *                    for the reset?
906  *      Answer: if a packet caused the RST, it is not for a socket
907  *              existing in our system; if it is matched to a socket,
908  *              it is just a duplicate segment or a bug in the other side's TCP.
909  *              So we build the reply based only on the parameters
910  *              that arrived with the segment.
911  *      Exception: precedence violation. We do not implement it in any case.
912  */
913
914 static void tcp_v4_send_reset(struct sk_buff *skb)
915 {
916         struct tcphdr *th = skb->h.th;
917         struct tcphdr rth;
918         struct ip_reply_arg arg;
919
920         /* Never send a reset in response to a reset. */
921         if (th->rst)
922                 return;
923
924         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
925                 return;
926
927         /* Swap the send and the receive. */
928         memset(&rth, 0, sizeof(struct tcphdr));
929         rth.dest   = th->source;
930         rth.source = th->dest;
931         rth.doff   = sizeof(struct tcphdr) / 4;
932         rth.rst    = 1;
933
934         if (th->ack) {
935                 rth.seq = th->ack_seq;
936         } else {
937                 rth.ack = 1;
938                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
939                                     skb->len - (th->doff << 2));
940         }
941
942         memset(&arg, 0, sizeof arg);
943         arg.iov[0].iov_base = (unsigned char *)&rth;
944         arg.iov[0].iov_len  = sizeof rth;
945         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
946                                       skb->nh.iph->saddr, /*XXX*/
947                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
948         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
949
950         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
951
952         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
953         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
954 }
955
956 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
957    outside of socket context, is certainly ugly. What can I do?
958  */
959
960 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
961                             u32 win, u32 ts)
962 {
963         struct tcphdr *th = skb->h.th;
964         struct {
965                 struct tcphdr th;
966                 u32 tsopt[3];
967         } rep;
968         struct ip_reply_arg arg;
969
970         memset(&rep.th, 0, sizeof(struct tcphdr));
971         memset(&arg, 0, sizeof arg);
972
973         arg.iov[0].iov_base = (unsigned char *)&rep;
974         arg.iov[0].iov_len  = sizeof(rep.th);
975         if (ts) {
976                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
977                                      (TCPOPT_TIMESTAMP << 8) |
978                                      TCPOLEN_TIMESTAMP);
979                 rep.tsopt[1] = htonl(tcp_time_stamp);
980                 rep.tsopt[2] = htonl(ts);
981                 arg.iov[0].iov_len = sizeof(rep);
982         }
983
984         /* Swap the send and the receive. */
985         rep.th.dest    = th->source;
986         rep.th.source  = th->dest;
987         rep.th.doff    = arg.iov[0].iov_len / 4;
988         rep.th.seq     = htonl(seq);
989         rep.th.ack_seq = htonl(ack);
990         rep.th.ack     = 1;
991         rep.th.window  = htons(win);
992
993         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
994                                       skb->nh.iph->saddr, /*XXX*/
995                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
996         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
997
998         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
999
1000         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1001 }
1002
1003 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1004 {
1005         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1006
1007         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1008                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1009
1010         tcp_tw_put(tw);
1011 }
1012
1013 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1014 {
1015         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1016                         req->ts_recent);
1017 }
1018
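/* Build the route for the SYN-ACK from the request's addresses, honouring
 * any source-routing option, and refuse strict source routes that do not
 * go through the requested gateway.
 */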
1019 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1020                                           struct request_sock *req)
1021 {
1022         struct rtable *rt;
1023         const struct inet_request_sock *ireq = inet_rsk(req);
1024         struct ip_options *opt = inet_rsk(req)->opt;
1025         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1026                             .nl_u = { .ip4_u =
1027                                       { .daddr = ((opt && opt->srr) ?
1028                                                   opt->faddr :
1029                                                   ireq->rmt_addr),
1030                                         .saddr = ireq->loc_addr,
1031                                         .tos = RT_CONN_FLAGS(sk) } },
1032                             .proto = IPPROTO_TCP,
1033                             .uli_u = { .ports =
1034                                        { .sport = inet_sk(sk)->sport,
1035                                          .dport = ireq->rmt_port } } };
1036
1037         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1038                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1039                 return NULL;
1040         }
1041         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1042                 ip_rt_put(rt);
1043                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1044                 return NULL;
1045         }
1046         return &rt->u.dst;
1047 }
1048
1049 /*
1050  *      Send a SYN-ACK after having received a SYN.
1051  *      This still operates on a request_sock only, not on a big
1052  *      socket.
1053  */
1054 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1055                               struct dst_entry *dst)
1056 {
1057         const struct inet_request_sock *ireq = inet_rsk(req);
1058         int err = -1;
1059         struct sk_buff * skb;
1060
1061         /* First, grab a route. */
1062         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1063                 goto out;
1064
1065         skb = tcp_make_synack(sk, dst, req);
1066
1067         if (skb) {
1068                 struct tcphdr *th = skb->h.th;
1069
1070                 th->check = tcp_v4_check(th, skb->len,
1071                                          ireq->loc_addr,
1072                                          ireq->rmt_addr,
1073                                          csum_partial((char *)th, skb->len,
1074                                                       skb->csum));
1075
1076                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1077                                             ireq->rmt_addr,
1078                                             ireq->opt);
1079                 if (err == NET_XMIT_CN)
1080                         err = 0;
1081         }
1082
1083 out:
1084         dst_release(dst);
1085         return err;
1086 }
1087
1088 /*
1089  *      IPv4 request_sock destructor.
1090  */
1091 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1092 {
1093         if (inet_rsk(req)->opt)
1094                 kfree(inet_rsk(req)->opt);
1095 }
1096
1097 static inline void syn_flood_warning(struct sk_buff *skb)
1098 {
1099         static unsigned long warntime;
1100
1101         if (time_after(jiffies, (warntime + HZ * 60))) {
1102                 warntime = jiffies;
1103                 printk(KERN_INFO
1104                        "possible SYN flooding on port %d. Sending cookies.\n",
1105                        ntohs(skb->h.th->dest));
1106         }
1107 }
1108
1109 /*
1110  * Save and compile IPv4 options into the request_sock if needed.
1111  */
1112 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1113                                                      struct sk_buff *skb)
1114 {
1115         struct ip_options *opt = &(IPCB(skb)->opt);
1116         struct ip_options *dopt = NULL;
1117
1118         if (opt && opt->optlen) {
1119                 int opt_size = optlength(opt);
1120                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1121                 if (dopt) {
1122                         if (ip_options_echo(dopt, skb)) {
1123                                 kfree(dopt);
1124                                 dopt = NULL;
1125                         }
1126                 }
1127         }
1128         return dopt;
1129 }
1130
1131 struct request_sock_ops tcp_request_sock_ops = {
1132         .family         =       PF_INET,
1133         .obj_size       =       sizeof(struct tcp_request_sock),
1134         .rtx_syn_ack    =       tcp_v4_send_synack,
1135         .send_ack       =       tcp_v4_reqsk_send_ack,
1136         .destructor     =       tcp_v4_reqsk_destructor,
1137         .send_reset     =       tcp_v4_send_reset,
1138 };
1139
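/* Handle an incoming SYN on a listening socket: allocate a request_sock,
 * parse the TCP options, pick an initial sequence number (or a syncookie
 * when the SYN queue is full) and send the SYN-ACK.
 */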
1140 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1141 {
1142         struct inet_request_sock *ireq;
1143         struct tcp_options_received tmp_opt;
1144         struct request_sock *req;
1145         __u32 saddr = skb->nh.iph->saddr;
1146         __u32 daddr = skb->nh.iph->daddr;
1147         __u32 isn = TCP_SKB_CB(skb)->when;
1148         struct dst_entry *dst = NULL;
1149 #ifdef CONFIG_SYN_COOKIES
1150         int want_cookie = 0;
1151 #else
1152 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1153 #endif
1154
1155         /* Never answer SYNs sent to broadcast or multicast addresses */
1156         if (((struct rtable *)skb->dst)->rt_flags &
1157             (RTCF_BROADCAST | RTCF_MULTICAST))
1158                 goto drop;
1159
1160         /* TW buckets are converted to open requests without
1161          * limitation; they conserve resources and the peer is
1162          * evidently a real one.
1163          */
1164         if (tcp_synq_is_full(sk) && !isn) {
1165 #ifdef CONFIG_SYN_COOKIES
1166                 if (sysctl_tcp_syncookies) {
1167                         want_cookie = 1;
1168                 } else
1169 #endif
1170                 goto drop;
1171         }
1172
1173         /* Accept backlog is full. If we have already queued enough
1174          * warm entries in the SYN queue, drop the request. That is better than
1175          * clogging the SYN queue with open requests with exponentially
1176          * increasing timeouts.
1177          */
1178         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1179                 goto drop;
1180
1181         req = reqsk_alloc(&tcp_request_sock_ops);
1182         if (!req)
1183                 goto drop;
1184
1185         tcp_clear_options(&tmp_opt);
1186         tmp_opt.mss_clamp = 536;
1187         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1188
1189         tcp_parse_options(skb, &tmp_opt, 0);
1190
1191         if (want_cookie) {
1192                 tcp_clear_options(&tmp_opt);
1193                 tmp_opt.saw_tstamp = 0;
1194         }
1195
1196         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1197                 /* Some OSes (unknown ones, but I see them on a web server which
1198                  * contains information interesting only for Windows
1199                  * users) do not send their timestamp in the SYN. It is an easy case:
1200                  * we simply do not advertise TS support.
1201                  */
1202                 tmp_opt.saw_tstamp = 0;
1203                 tmp_opt.tstamp_ok  = 0;
1204         }
1205         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1206
1207         tcp_openreq_init(req, &tmp_opt, skb);
1208
1209         ireq = inet_rsk(req);
1210         ireq->loc_addr = daddr;
1211         ireq->rmt_addr = saddr;
1212         ireq->opt = tcp_v4_save_options(sk, skb);
1213         if (!want_cookie)
1214                 TCP_ECN_create_request(req, skb->h.th);
1215
1216         if (want_cookie) {
1217 #ifdef CONFIG_SYN_COOKIES
1218                 syn_flood_warning(skb);
1219 #endif
1220                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1221         } else if (!isn) {
1222                 struct inet_peer *peer = NULL;
1223
1224                 /* VJ's idea. We save the last timestamp seen
1225                  * from the destination in the peer table when entering
1226                  * TIME-WAIT state, and check against it before
1227                  * accepting a new connection request.
1228                  *
1229                  * If "isn" is not zero, this request hit an alive
1230                  * TIME-WAIT bucket, so all the necessary checks
1231                  * are made in the function processing the TIME-WAIT state.
1232                  */
1233                 if (tmp_opt.saw_tstamp &&
1234                     sysctl_tcp_tw_recycle &&
1235                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1236                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1237                     peer->v4daddr == saddr) {
1238                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1239                             (s32)(peer->tcp_ts - req->ts_recent) >
1240                                                         TCP_PAWS_WINDOW) {
1241                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1242                                 dst_release(dst);
1243                                 goto drop_and_free;
1244                         }
1245                 }
1246                 /* Kill the following clause, if you dislike this way. */
1247                 else if (!sysctl_tcp_syncookies &&
1248                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1249                           (sysctl_max_syn_backlog >> 2)) &&
1250                          (!peer || !peer->tcp_ts_stamp) &&
1251                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1252                         /* Without syncookies the last quarter of the
1253                          * backlog is filled only with destinations
1254                          * proven to be alive.
1255                          * It means that we continue to communicate
1256                          * with destinations already remembered
1257                          * at the moment of the SYN flood.
1258                          */
1259                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1260                                               "request from %u.%u."
1261                                               "%u.%u/%u\n",
1262                                               NIPQUAD(saddr),
1263                                               ntohs(skb->h.th->source)));
1264                         dst_release(dst);
1265                         goto drop_and_free;
1266                 }
1267
1268                 isn = tcp_v4_init_sequence(sk, skb);
1269         }
1270         tcp_rsk(req)->snt_isn = isn;
1271
1272         if (tcp_v4_send_synack(sk, req, dst))
1273                 goto drop_and_free;
1274
1275         if (want_cookie) {
1276                 reqsk_free(req);
1277         } else {
1278                 tcp_v4_synq_add(sk, req);
1279         }
1280         return 0;
1281
1282 drop_and_free:
1283         reqsk_free(req);
1284 drop:
1285         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1286         return 0;
1287 }
1288
1289
1290 /*
1291  * The three way handshake has completed - we got a valid synack -
1292  * now create the new socket.
1293  */
1294 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1295                                   struct request_sock *req,
1296                                   struct dst_entry *dst)
1297 {
1298         struct inet_request_sock *ireq;
1299         struct inet_sock *newinet;
1300         struct tcp_sock *newtp;
1301         struct sock *newsk;
1302
1303         if (sk_acceptq_is_full(sk))
1304                 goto exit_overflow;
1305
1306         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1307                 goto exit;
1308
1309         newsk = tcp_create_openreq_child(sk, req, skb);
1310         if (!newsk)
1311                 goto exit;
1312
1313         sk_setup_caps(newsk, dst);
1314
1315         newtp                 = tcp_sk(newsk);
1316         newinet               = inet_sk(newsk);
1317         ireq                  = inet_rsk(req);
1318         newinet->daddr        = ireq->rmt_addr;
1319         newinet->rcv_saddr    = ireq->loc_addr;
1320         newinet->saddr        = ireq->loc_addr;
1321         newinet->opt          = ireq->opt;
1322         ireq->opt             = NULL;
1323         newinet->mc_index     = tcp_v4_iif(skb);
1324         newinet->mc_ttl       = skb->nh.iph->ttl;
1325         newtp->ext_header_len = 0;
1326         if (newinet->opt)
1327                 newtp->ext_header_len = newinet->opt->optlen;
1328         newinet->id = newtp->write_seq ^ jiffies;
1329
1330         tcp_sync_mss(newsk, dst_mtu(dst));
1331         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1332         tcp_initialize_rcv_mss(newsk);
1333
1334         __inet_hash(&tcp_hashinfo, newsk, 0);
1335         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1336
1337         return newsk;
1338
1339 exit_overflow:
1340         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1341 exit:
1342         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1343         dst_release(dst);
1344         return NULL;
1345 }
1346
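/* For a segment arriving on a listening socket, look for a matching pending
 * connection request first, then for an established or TIME-WAIT socket
 * for the same four-tuple.
 */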
1347 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1348 {
1349         struct tcphdr *th = skb->h.th;
1350         struct iphdr *iph = skb->nh.iph;
1351         struct tcp_sock *tp = tcp_sk(sk);
1352         struct sock *nsk;
1353         struct request_sock **prev;
1354         /* Find possible connection requests. */
1355         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1356                                                      iph->saddr, iph->daddr);
1357         if (req)
1358                 return tcp_check_req(sk, skb, req, prev);
1359
1360         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1361                                           th->source,
1362                                           skb->nh.iph->daddr,
1363                                           ntohs(th->dest),
1364                                           tcp_v4_iif(skb));
1365
1366         if (nsk) {
1367                 if (nsk->sk_state != TCP_TIME_WAIT) {
1368                         bh_lock_sock(nsk);
1369                         return nsk;
1370                 }
1371                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1372                 return NULL;
1373         }
1374
1375 #ifdef CONFIG_SYN_COOKIES
1376         if (!th->rst && !th->syn && th->ack)
1377                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1378 #endif
1379         return sk;
1380 }
1381
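/* Validate the TCP checksum: trust a correct hardware checksum, fully verify
 * short packets (<= 76 bytes) in software, and otherwise store the
 * pseudo-header checksum so the full check can be completed later.
 */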
1382 static int tcp_v4_checksum_init(struct sk_buff *skb)
1383 {
1384         if (skb->ip_summed == CHECKSUM_HW) {
1385                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1386                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1387                                   skb->nh.iph->daddr, skb->csum))
1388                         return 0;
1389
1390                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1391                 skb->ip_summed = CHECKSUM_NONE;
1392         }
1393         if (skb->len <= 76) {
1394                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1395                                  skb->nh.iph->daddr,
1396                                  skb_checksum(skb, 0, skb->len, 0)))
1397                         return -1;
1398                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1399         } else {
1400                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1401                                           skb->nh.iph->saddr,
1402                                           skb->nh.iph->daddr, 0);
1403         }
1404         return 0;
1405 }
1406
1407
1408 /* The socket must have its spinlock held when we get
1409  * here.
1410  *
1411  * We have a potential double-lock case here, so even when
1412  * doing backlog processing we use the BH locking scheme.
1413  * This is because we cannot sleep with the original spinlock
1414  * held.
1415  */
1416 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1417 {
1418         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1419                 TCP_CHECK_TIMER(sk);
1420                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1421                         goto reset;
1422                 TCP_CHECK_TIMER(sk);
1423                 return 0;
1424         }
1425
1426         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1427                 goto csum_err;
1428
1429         if (sk->sk_state == TCP_LISTEN) {
1430                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1431                 if (!nsk)
1432                         goto discard;
1433
1434                 if (nsk != sk) {
1435                         if (tcp_child_process(sk, nsk, skb))
1436                                 goto reset;
1437                         return 0;
1438                 }
1439         }
1440
1441         TCP_CHECK_TIMER(sk);
1442         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1443                 goto reset;
1444         TCP_CHECK_TIMER(sk);
1445         return 0;
1446
1447 reset:
1448         tcp_v4_send_reset(skb);
1449 discard:
1450         kfree_skb(skb);
1451         /* Be careful here. If this function gets more complicated and
1452          * gcc suffers from register pressure on the x86, sk (in %ebx)
1453          * might be destroyed here. This current version compiles correctly,
1454          * but you have been warned.
1455          */
1456         return 0;
1457
1458 csum_err:
1459         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1460         goto discard;
1461 }
1462
1463 /*
1464  *      From tcp_input.c
1465  */
1466
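/* Main per-segment entry point, called from the IP layer.  Validates the
 * header, fills in the TCP control block, looks up the owning socket and
 * then either processes the segment directly, places it on the prequeue,
 * or backlogs it while the socket is owned by the user.
 */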
1467 int tcp_v4_rcv(struct sk_buff *skb)
1468 {
1469         struct tcphdr *th;
1470         struct sock *sk;
1471         int ret;
1472
1473         if (skb->pkt_type != PACKET_HOST)
1474                 goto discard_it;
1475
1476         /* Count it even if it's bad */
1477         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1478
1479         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1480                 goto discard_it;
1481
1482         th = skb->h.th;
1483
1484         if (th->doff < sizeof(struct tcphdr) / 4)
1485                 goto bad_packet;
1486         if (!pskb_may_pull(skb, th->doff * 4))
1487                 goto discard_it;
1488
1489         /* An explanation is required here, I think.
1490          * Packet length and doff are validated by header prediction,
1491          * provided the case of th->doff==0 is eliminated.
1492          * So, we defer the checks. */
1493         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1494              tcp_v4_checksum_init(skb) < 0))
1495                 goto bad_packet;
1496
1497         th = skb->h.th;
1498         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1499         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1500                                     skb->len - th->doff * 4);
1501         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1502         TCP_SKB_CB(skb)->when    = 0;
1503         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1504         TCP_SKB_CB(skb)->sacked  = 0;
1505
1506         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1507                              skb->nh.iph->daddr, ntohs(th->dest),
1508                              tcp_v4_iif(skb));
1509
1510         if (!sk)
1511                 goto no_tcp_socket;
1512
1513 process:
1514         if (sk->sk_state == TCP_TIME_WAIT)
1515                 goto do_time_wait;
1516
1517         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1518                 goto discard_and_relse;
1519
1520         if (sk_filter(sk, skb, 0))
1521                 goto discard_and_relse;
1522
1523         skb->dev = NULL;
1524
1525         bh_lock_sock(sk);
1526         ret = 0;
1527         if (!sock_owned_by_user(sk)) {
1528                 if (!tcp_prequeue(sk, skb))
1529                         ret = tcp_v4_do_rcv(sk, skb);
1530         } else
1531                 sk_add_backlog(sk, skb);
1532         bh_unlock_sock(sk);
1533
1534         sock_put(sk);
1535
1536         return ret;
1537
1538 no_tcp_socket:
1539         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1540                 goto discard_it;
1541
1542         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1543 bad_packet:
1544                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1545         } else {
1546                 tcp_v4_send_reset(skb);
1547         }
1548
1549 discard_it:
1550         /* Discard frame. */
1551         kfree_skb(skb);
1552         return 0;
1553
1554 discard_and_relse:
1555         sock_put(sk);
1556         goto discard_it;
1557
1558 do_time_wait:
1559         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1560                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1561                 goto discard_it;
1562         }
1563
1564         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1565                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1566                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1567                 goto discard_it;
1568         }
1569         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1570                                            skb, th, skb->len)) {
1571         case TCP_TW_SYN: {
1572                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1573                                                         skb->nh.iph->daddr,
1574                                                         ntohs(th->dest),
1575                                                         tcp_v4_iif(skb));
1576                 if (sk2) {
1577                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1578                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1579                         sk = sk2;
1580                         goto process;
1581                 }
1582                 /* Fall through to ACK */
1583         }
1584         case TCP_TW_ACK:
1585                 tcp_v4_timewait_ack(sk, skb);
1586                 break;
1587         case TCP_TW_RST:
1588                 goto no_tcp_socket;
1589         case TCP_TW_SUCCESS:;
1590         }
1591         goto discard_it;
1592 }
1593
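/* Report the connected peer's address and port as a sockaddr_in; this is
 * the af-specific addr2sockaddr operation (see ipv4_specific below).
 */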
1594 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1595 {
1596         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1597         struct inet_sock *inet = inet_sk(sk);
1598
1599         sin->sin_family         = AF_INET;
1600         sin->sin_addr.s_addr    = inet->daddr;
1601         sin->sin_port           = inet->dport;
1602 }
1603
1604 /* VJ's idea. Save the last timestamp seen from this destination and hold
1605  * it for at least the normal TIME-WAIT interval, to use for duplicate
1606  * segment detection in subsequent connections before they enter the
1607  * synchronized state.
1608  */
1609
1610 int tcp_v4_remember_stamp(struct sock *sk)
1611 {
1612         struct inet_sock *inet = inet_sk(sk);
1613         struct tcp_sock *tp = tcp_sk(sk);
1614         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1615         struct inet_peer *peer = NULL;
1616         int release_it = 0;
1617
1618         if (!rt || rt->rt_dst != inet->daddr) {
1619                 peer = inet_getpeer(inet->daddr, 1);
1620                 release_it = 1;
1621         } else {
1622                 if (!rt->peer)
1623                         rt_bind_peer(rt, 1);
1624                 peer = rt->peer;
1625         }
1626
1627         if (peer) {
1628                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1629                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1630                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1631                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1632                         peer->tcp_ts = tp->rx_opt.ts_recent;
1633                 }
1634                 if (release_it)
1635                         inet_putpeer(peer);
1636                 return 1;
1637         }
1638
1639         return 0;
1640 }
1641
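/* As tcp_v4_remember_stamp(), but for a connection already in TIME_WAIT:
 * the timestamp state comes from the tcp_tw_bucket rather than from a
 * full socket.
 */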
1642 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1643 {
1644         struct inet_peer *peer = NULL;
1645
1646         peer = inet_getpeer(tw->tw_daddr, 1);
1647
1648         if (peer) {
1649                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1650                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1651                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1652                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1653                         peer->tcp_ts = tw->tw_ts_recent;
1654                 }
1655                 inet_putpeer(peer);
1656                 return 1;
1657         }
1658
1659         return 0;
1660 }
1661
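/* The AF_INET flavour of the af-specific TCP operations, installed as
 * tp->af_specific by tcp_v4_init_sock().
 */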
1662 struct tcp_func ipv4_specific = {
1663         .queue_xmit     =       ip_queue_xmit,
1664         .send_check     =       tcp_v4_send_check,
1665         .rebuild_header =       inet_sk_rebuild_header,
1666         .conn_request   =       tcp_v4_conn_request,
1667         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1668         .remember_stamp =       tcp_v4_remember_stamp,
1669         .net_header_len =       sizeof(struct iphdr),
1670         .setsockopt     =       ip_setsockopt,
1671         .getsockopt     =       ip_getsockopt,
1672         .addr2sockaddr  =       v4_addr2sockaddr,
1673         .sockaddr_len   =       sizeof(struct sockaddr_in),
1674 };
1675
1676 /* NOTE: A lot of things are set to zero explicitly by the call to
1677  *       sk_alloc(), so they need not be done here.
1678  */
1679 static int tcp_v4_init_sock(struct sock *sk)
1680 {
1681         struct tcp_sock *tp = tcp_sk(sk);
1682
1683         skb_queue_head_init(&tp->out_of_order_queue);
1684         tcp_init_xmit_timers(sk);
1685         tcp_prequeue_init(tp);
1686
1687         tp->rto  = TCP_TIMEOUT_INIT;
1688         tp->mdev = TCP_TIMEOUT_INIT;
1689
1690         /* So many TCP implementations out there (incorrectly) count the
1691          * initial SYN frame in their delayed-ACK and congestion control
1692          * algorithms that we must have the following bandaid to talk
1693          * efficiently to them.  -DaveM
1694          */
1695         tp->snd_cwnd = 2;
1696
1697         /* See draft-stevens-tcpca-spec-01 for discussion of the
1698          * initialization of these values.
1699          */
1700         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1701         tp->snd_cwnd_clamp = ~0;
1702         tp->mss_cache = 536;
1703
1704         tp->reordering = sysctl_tcp_reordering;
1705         tp->ca_ops = &tcp_init_congestion_ops;
1706
1707         sk->sk_state = TCP_CLOSE;
1708
1709         sk->sk_write_space = sk_stream_write_space;
1710         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1711
1712         tp->af_specific = &ipv4_specific;
1713
1714         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1715         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1716
1717         atomic_inc(&tcp_sockets_allocated);
1718
1719         return 0;
1720 }
1721
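/* Release the TCP state still attached to a socket being destroyed:
 * timers, congestion control state, queued skbs, the bind bucket and the
 * cached sendmsg page.
 */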
1722 int tcp_v4_destroy_sock(struct sock *sk)
1723 {
1724         struct tcp_sock *tp = tcp_sk(sk);
1725
1726         tcp_clear_xmit_timers(sk);
1727
1728         tcp_cleanup_congestion_control(tp);
1729
1730         /* Clean up the write buffer. */
1731         sk_stream_writequeue_purge(sk);
1732
1733         /* Clean up our (hopefully empty) out_of_order_queue. */
1734         __skb_queue_purge(&tp->out_of_order_queue);
1735
1736         /* Clean up the prequeue; it really should be empty already. */
1737         __skb_queue_purge(&tp->ucopy.prequeue);
1738
1739         /* Clean up a referenced TCP bind bucket. */
1740         if (inet_sk(sk)->bind_hash)
1741                 inet_put_port(&tcp_hashinfo, sk);
1742
1743         /*
1744          * If sendmsg cached page exists, toss it.
1745          */
1746         if (sk->sk_sndmsg_page) {
1747                 __free_page(sk->sk_sndmsg_page);
1748                 sk->sk_sndmsg_page = NULL;
1749         }
1750
1751         atomic_dec(&tcp_sockets_allocated);
1752
1753         return 0;
1754 }
1755
1756 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1757
1758 #ifdef CONFIG_PROC_FS
1759 /* Proc filesystem TCP sock list dumping. */
1760
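/* Helpers for walking the TIME_WAIT half of an ehash chain while dumping
 * the socket lists below.
 */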
1761 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1762 {
1763         return hlist_empty(head) ? NULL :
1764                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1765 }
1766
1767 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1768 {
1769         return tw->tw_node.next ?
1770                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1771 }
1772
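/* Advance the iterator over the listening hash.  Each listener's SYN
 * queue is walked as well (TCP_SEQ_STATE_OPENREQ), so pending connection
 * requests appear in the dump alongside the listening sockets.
 */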
1773 static void *listening_get_next(struct seq_file *seq, void *cur)
1774 {
1775         struct tcp_sock *tp;
1776         struct hlist_node *node;
1777         struct sock *sk = cur;
1778         struct tcp_iter_state* st = seq->private;
1779
1780         if (!sk) {
1781                 st->bucket = 0;
1782                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1783                 goto get_sk;
1784         }
1785
1786         ++st->num;
1787
1788         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1789                 struct request_sock *req = cur;
1790
1791                 tp = tcp_sk(st->syn_wait_sk);
1792                 req = req->dl_next;
1793                 while (1) {
1794                         while (req) {
1795                                 if (req->rsk_ops->family == st->family) {
1796                                         cur = req;
1797                                         goto out;
1798                                 }
1799                                 req = req->dl_next;
1800                         }
1801                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1802                                 break;
1803 get_req:
1804                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1805                 }
1806                 sk        = sk_next(st->syn_wait_sk);
1807                 st->state = TCP_SEQ_STATE_LISTENING;
1808                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1809         } else {
1810                 tp = tcp_sk(sk);
1811                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1812                 if (reqsk_queue_len(&tp->accept_queue))
1813                         goto start_req;
1814                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1815                 sk = sk_next(sk);
1816         }
1817 get_sk:
1818         sk_for_each_from(sk, node) {
1819                 if (sk->sk_family == st->family) {
1820                         cur = sk;
1821                         goto out;
1822                 }
1823                 tp = tcp_sk(sk);
1824                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1825                 if (reqsk_queue_len(&tp->accept_queue)) {
1826 start_req:
1827                         st->uid         = sock_i_uid(sk);
1828                         st->syn_wait_sk = sk;
1829                         st->state       = TCP_SEQ_STATE_OPENREQ;
1830                         st->sbucket     = 0;
1831                         goto get_req;
1832                 }
1833                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1834         }
1835         if (++st->bucket < INET_LHTABLE_SIZE) {
1836                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1837                 goto get_sk;
1838         }
1839         cur = NULL;
1840 out:
1841         return cur;
1842 }
1843
1844 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1845 {
1846         void *rc = listening_get_next(seq, NULL);
1847
1848         while (rc && *pos) {
1849                 rc = listening_get_next(seq, rc);
1850                 --*pos;
1851         }
1852         return rc;
1853 }
1854
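/* Find the first socket of the requested family in the established hash,
 * checking each bucket's TIME_WAIT chain as well.  On success the bucket
 * lock is left held; it is dropped when the iterator moves on or in
 * tcp_seq_stop().
 */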
1855 static void *established_get_first(struct seq_file *seq)
1856 {
1857         struct tcp_iter_state* st = seq->private;
1858         void *rc = NULL;
1859
1860         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1861                 struct sock *sk;
1862                 struct hlist_node *node;
1863                 struct tcp_tw_bucket *tw;
1864
1865                 /* We can reschedule _before_ having picked the target: */
1866                 cond_resched_softirq();
1867
1868                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1869                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1870                         if (sk->sk_family != st->family) {
1871                                 continue;
1872                         }
1873                         rc = sk;
1874                         goto out;
1875                 }
1876                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1877                 tw_for_each(tw, node,
1878                             &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1879                         if (tw->tw_family != st->family) {
1880                                 continue;
1881                         }
1882                         rc = tw;
1883                         goto out;
1884                 }
1885                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1886                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1887         }
1888 out:
1889         return rc;
1890 }
1891
1892 static void *established_get_next(struct seq_file *seq, void *cur)
1893 {
1894         struct sock *sk = cur;
1895         struct tcp_tw_bucket *tw;
1896         struct hlist_node *node;
1897         struct tcp_iter_state* st = seq->private;
1898
1899         ++st->num;
1900
1901         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1902                 tw = cur;
1903                 tw = tw_next(tw);
1904 get_tw:
1905                 while (tw && tw->tw_family != st->family) {
1906                         tw = tw_next(tw);
1907                 }
1908                 if (tw) {
1909                         cur = tw;
1910                         goto out;
1911                 }
1912                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1913                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1914
1915                 /* We can reschedule between buckets: */
1916                 cond_resched_softirq();
1917
1918                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1919                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1920                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1921                 } else {
1922                         cur = NULL;
1923                         goto out;
1924                 }
1925         } else
1926                 sk = sk_next(sk);
1927
1928         sk_for_each_from(sk, node) {
1929                 if (sk->sk_family == st->family)
1930                         goto found;
1931         }
1932
1933         st->state = TCP_SEQ_STATE_TIME_WAIT;
1934         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1935         goto get_tw;
1936 found:
1937         cur = sk;
1938 out:
1939         return cur;
1940 }
1941
1942 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1943 {
1944         void *rc = established_get_first(seq);
1945
1946         while (rc && pos) {
1947                 rc = established_get_next(seq, rc);
1948                 --pos;
1949         }
1950         return rc;
1951 }
1952
1953 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1954 {
1955         void *rc;
1956         struct tcp_iter_state* st = seq->private;
1957
1958         inet_listen_lock(&tcp_hashinfo);
1959         st->state = TCP_SEQ_STATE_LISTENING;
1960         rc        = listening_get_idx(seq, &pos);
1961
1962         if (!rc) {
1963                 inet_listen_unlock(&tcp_hashinfo);
1964                 local_bh_disable();
1965                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1966                 rc        = established_get_idx(seq, pos);
1967         }
1968
1969         return rc;
1970 }
1971
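/* The seq_file operations proper: tcp_seq_start() returns SEQ_START_TOKEN
 * for the header line, tcp_seq_next() steps through the LISTENING/OPENREQ
 * and then ESTABLISHED/TIME_WAIT phases, and tcp_seq_stop() drops whatever
 * lock the iterator currently holds.
 */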
1972 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1973 {
1974         struct tcp_iter_state* st = seq->private;
1975         st->state = TCP_SEQ_STATE_LISTENING;
1976         st->num = 0;
1977         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1978 }
1979
1980 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1981 {
1982         void *rc = NULL;
1983         struct tcp_iter_state* st;
1984
1985         if (v == SEQ_START_TOKEN) {
1986                 rc = tcp_get_idx(seq, 0);
1987                 goto out;
1988         }
1989         st = seq->private;
1990
1991         switch (st->state) {
1992         case TCP_SEQ_STATE_OPENREQ:
1993         case TCP_SEQ_STATE_LISTENING:
1994                 rc = listening_get_next(seq, v);
1995                 if (!rc) {
1996                         inet_listen_unlock(&tcp_hashinfo);
1997                         local_bh_disable();
1998                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1999                         rc        = established_get_first(seq);
2000                 }
2001                 break;
2002         case TCP_SEQ_STATE_ESTABLISHED:
2003         case TCP_SEQ_STATE_TIME_WAIT:
2004                 rc = established_get_next(seq, v);
2005                 break;
2006         }
2007 out:
2008         ++*pos;
2009         return rc;
2010 }
2011
2012 static void tcp_seq_stop(struct seq_file *seq, void *v)
2013 {
2014         struct tcp_iter_state* st = seq->private;
2015
2016         switch (st->state) {
2017         case TCP_SEQ_STATE_OPENREQ:
2018                 if (v) {
2019                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2020                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2021                 }
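                /* Fall through: the listen lock must be dropped as well. */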
2022         case TCP_SEQ_STATE_LISTENING:
2023                 if (v != SEQ_START_TOKEN)
2024                         inet_listen_unlock(&tcp_hashinfo);
2025                 break;
2026         case TCP_SEQ_STATE_TIME_WAIT:
2027         case TCP_SEQ_STATE_ESTABLISHED:
2028                 if (v)
2029                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2030                 local_bh_enable();
2031                 break;
2032         }
2033 }
2034
2035 static int tcp_seq_open(struct inode *inode, struct file *file)
2036 {
2037         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2038         struct seq_file *seq;
2039         struct tcp_iter_state *s;
2040         int rc;
2041
2042         if (unlikely(afinfo == NULL))
2043                 return -EINVAL;
2044
2045         s = kmalloc(sizeof(*s), GFP_KERNEL);
2046         if (!s)
2047                 return -ENOMEM;
2048         memset(s, 0, sizeof(*s));
2049         s->family               = afinfo->family;
2050         s->seq_ops.start        = tcp_seq_start;
2051         s->seq_ops.next         = tcp_seq_next;
2052         s->seq_ops.show         = afinfo->seq_show;
2053         s->seq_ops.stop         = tcp_seq_stop;
2054
2055         rc = seq_open(file, &s->seq_ops);
2056         if (rc)
2057                 goto out_kfree;
2058         seq          = file->private_data;
2059         seq->private = s;
2060 out:
2061         return rc;
2062 out_kfree:
2063         kfree(s);
2064         goto out;
2065 }
2066
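/* Register an af-specific TCP seq_file under /proc/net.  The caller
 * supplies a tcp_seq_afinfo describing its family and show routine; the
 * real IPv4 instance is tcp4_seq_afinfo below.  A hypothetical user
 * (the "foo" names are illustrative only) would look roughly like:
 *
 *	static struct file_operations foo_seq_fops;
 *	static struct tcp_seq_afinfo foo_seq_afinfo = {
 *		.owner		= THIS_MODULE,
 *		.name		= "foo",
 *		.family		= AF_INET,
 *		.seq_show	= foo_seq_show,
 *		.seq_fops	= &foo_seq_fops,
 *	};
 *
 *	tcp_proc_register(&foo_seq_afinfo);
 */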
2067 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2068 {
2069         int rc = 0;
2070         struct proc_dir_entry *p;
2071
2072         if (!afinfo)
2073                 return -EINVAL;
2074         afinfo->seq_fops->owner         = afinfo->owner;
2075         afinfo->seq_fops->open          = tcp_seq_open;
2076         afinfo->seq_fops->read          = seq_read;
2077         afinfo->seq_fops->llseek        = seq_lseek;
2078         afinfo->seq_fops->release       = seq_release_private;
2079
2080         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2081         if (p)
2082                 p->data = afinfo;
2083         else
2084                 rc = -ENOMEM;
2085         return rc;
2086 }
2087
2088 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2089 {
2090         if (!afinfo)
2091                 return;
2092         proc_net_remove(afinfo->name);
2093         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2094 }
2095
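/* Format one row of the /proc/net/tcp output for a connection request
 * that is still in SYN_RECV.
 */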
2096 static void get_openreq4(struct sock *sk, struct request_sock *req,
2097                          char *tmpbuf, int i, int uid)
2098 {
2099         const struct inet_request_sock *ireq = inet_rsk(req);
2100         int ttd = req->expires - jiffies;
2101
2102         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2103                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2104                 i,
2105                 ireq->loc_addr,
2106                 ntohs(inet_sk(sk)->sport),
2107                 ireq->rmt_addr,
2108                 ntohs(ireq->rmt_port),
2109                 TCP_SYN_RECV,
2110                 0, 0, /* could print option size, but that is af dependent. */
2111                 1,    /* timers active (only the expire timer) */
2112                 jiffies_to_clock_t(ttd),
2113                 req->retrans,
2114                 uid,
2115                 0,  /* non standard timer */
2116                 0, /* open_requests have no inode */
2117                 atomic_read(&sk->sk_refcnt),
2118                 req);
2119 }
2120
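/* Format one row for a full socket, including which timer is pending
 * (retransmit, zero-window probe or the generic sk_timer) and when it
 * expires.
 */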
2121 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2122 {
2123         int timer_active;
2124         unsigned long timer_expires;
2125         struct tcp_sock *tp = tcp_sk(sp);
2126         struct inet_sock *inet = inet_sk(sp);
2127         unsigned int dest = inet->daddr;
2128         unsigned int src = inet->rcv_saddr;
2129         __u16 destp = ntohs(inet->dport);
2130         __u16 srcp = ntohs(inet->sport);
2131
2132         if (tp->pending == TCP_TIME_RETRANS) {
2133                 timer_active    = 1;
2134                 timer_expires   = tp->timeout;
2135         } else if (tp->pending == TCP_TIME_PROBE0) {
2136                 timer_active    = 4;
2137                 timer_expires   = tp->timeout;
2138         } else if (timer_pending(&sp->sk_timer)) {
2139                 timer_active    = 2;
2140                 timer_expires   = sp->sk_timer.expires;
2141         } else {
2142                 timer_active    = 0;
2143                 timer_expires = jiffies;
2144         }
2145
2146         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2147                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2148                 i, src, srcp, dest, destp, sp->sk_state,
2149                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2150                 timer_active,
2151                 jiffies_to_clock_t(timer_expires - jiffies),
2152                 tp->retransmits,
2153                 sock_i_uid(sp),
2154                 tp->probes_out,
2155                 sock_i_ino(sp),
2156                 atomic_read(&sp->sk_refcnt), sp,
2157                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2158                 tp->snd_cwnd,
2159                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2160 }
2161
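/* Format one row for a TIME_WAIT socket; the interesting field is how
 * much of the TIME_WAIT period remains.
 */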
2162 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2163 {
2164         unsigned int dest, src;
2165         __u16 destp, srcp;
2166         int ttd = tw->tw_ttd - jiffies;
2167
2168         if (ttd < 0)
2169                 ttd = 0;
2170
2171         dest  = tw->tw_daddr;
2172         src   = tw->tw_rcv_saddr;
2173         destp = ntohs(tw->tw_dport);
2174         srcp  = ntohs(tw->tw_sport);
2175
2176         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2177                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2178                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2179                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2180                 atomic_read(&tw->tw_refcnt), tw);
2181 }
2182
2183 #define TMPSZ 150
2184
2185 static int tcp4_seq_show(struct seq_file *seq, void *v)
2186 {
2187         struct tcp_iter_state* st;
2188         char tmpbuf[TMPSZ + 1];
2189
2190         if (v == SEQ_START_TOKEN) {
2191                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2192                            "  sl  local_address rem_address   st tx_queue "
2193                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2194                            "inode");
2195                 goto out;
2196         }
2197         st = seq->private;
2198
2199         switch (st->state) {
2200         case TCP_SEQ_STATE_LISTENING:
2201         case TCP_SEQ_STATE_ESTABLISHED:
2202                 get_tcp4_sock(v, tmpbuf, st->num);
2203                 break;
2204         case TCP_SEQ_STATE_OPENREQ:
2205                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2206                 break;
2207         case TCP_SEQ_STATE_TIME_WAIT:
2208                 get_timewait4_sock(v, tmpbuf, st->num);
2209                 break;
2210         }
2211         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2212 out:
2213         return 0;
2214 }
2215
2216 static struct file_operations tcp4_seq_fops;
2217 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2218         .owner          = THIS_MODULE,
2219         .name           = "tcp",
2220         .family         = AF_INET,
2221         .seq_show       = tcp4_seq_show,
2222         .seq_fops       = &tcp4_seq_fops,
2223 };
2224
2225 int __init tcp4_proc_init(void)
2226 {
2227         return tcp_proc_register(&tcp4_seq_afinfo);
2228 }
2229
2230 void tcp4_proc_exit(void)
2231 {
2232         tcp_proc_unregister(&tcp4_seq_afinfo);
2233 }
2234 #endif /* CONFIG_PROC_FS */
2235
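/* The struct proto hooks through which the generic socket and INET layers
 * drive TCP sockets.
 */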
2236 struct proto tcp_prot = {
2237         .name                   = "TCP",
2238         .owner                  = THIS_MODULE,
2239         .close                  = tcp_close,
2240         .connect                = tcp_v4_connect,
2241         .disconnect             = tcp_disconnect,
2242         .accept                 = tcp_accept,
2243         .ioctl                  = tcp_ioctl,
2244         .init                   = tcp_v4_init_sock,
2245         .destroy                = tcp_v4_destroy_sock,
2246         .shutdown               = tcp_shutdown,
2247         .setsockopt             = tcp_setsockopt,
2248         .getsockopt             = tcp_getsockopt,
2249         .sendmsg                = tcp_sendmsg,
2250         .recvmsg                = tcp_recvmsg,
2251         .backlog_rcv            = tcp_v4_do_rcv,
2252         .hash                   = tcp_v4_hash,
2253         .unhash                 = tcp_unhash,
2254         .get_port               = tcp_v4_get_port,
2255         .enter_memory_pressure  = tcp_enter_memory_pressure,
2256         .sockets_allocated      = &tcp_sockets_allocated,
2257         .memory_allocated       = &tcp_memory_allocated,
2258         .memory_pressure        = &tcp_memory_pressure,
2259         .sysctl_mem             = sysctl_tcp_mem,
2260         .sysctl_wmem            = sysctl_tcp_wmem,
2261         .sysctl_rmem            = sysctl_tcp_rmem,
2262         .max_header             = MAX_TCP_HEADER,
2263         .obj_size               = sizeof(struct tcp_sock),
2264         .rsk_prot               = &tcp_request_sock_ops,
2265 };
2266
2267
2268
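/* Boot-time initialisation: create the kernel-internal control socket
 * used for sending replies (such as resets) on behalf of sockets we do
 * not own.
 */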
2269 void __init tcp_v4_init(struct net_proto_family *ops)
2270 {
2271         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2272         if (err < 0)
2273                 panic("Failed to create the TCP control socket.\n");
2274         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2275         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2276
2277         /* Unhash it so that IP input processing does not even
2278          * see it; we do not want this socket to receive incoming
2279          * packets.
2280          */
2281         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2282 }
2283
2284 EXPORT_SYMBOL(ipv4_specific);
2285 EXPORT_SYMBOL(inet_bind_bucket_create);
2286 EXPORT_SYMBOL(tcp_hashinfo);
2287 EXPORT_SYMBOL(tcp_prot);
2288 EXPORT_SYMBOL(tcp_unhash);
2289 EXPORT_SYMBOL(tcp_v4_conn_request);
2290 EXPORT_SYMBOL(tcp_v4_connect);
2291 EXPORT_SYMBOL(tcp_v4_do_rcv);
2292 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2293 EXPORT_SYMBOL(tcp_v4_send_check);
2294 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2295
2296 #ifdef CONFIG_PROC_FS
2297 EXPORT_SYMBOL(tcp_proc_register);
2298 EXPORT_SYMBOL(tcp_proc_unregister);
2299 #endif
2300 EXPORT_SYMBOL(sysctl_local_port_range);
2301 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2302 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2303