SAFE public projects git trees. - safe/jmp/linux-2.6/blob - net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
  42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/tcp.h>
  68 #include <net/ipv6.h>
  69 #include <net/inet_common.h>
  70 #include <net/xfrm.h>
  71
  72 #include <linux/inet.h>
  73 #include <linux/ipv6.h>
  74 #include <linux/stddef.h>
  75 #include <linux/proc_fs.h>
  76 #include <linux/seq_file.h>
  77
  78 extern int sysctl_ip_dynaddr;       /* declaration only; defined elsewhere */
     /* Consulted by __tcp_v4_check_established() when its caller passes a twp
      * pointer: non-zero allows taking over a matching TIME-WAIT entry whose
      * recorded timestamp is more than one second old.
      */
  79 int sysctl_tcp_tw_reuse;
     /* NOTE(review): not referenced in this part of the file; by its name a
      * latency-related tunable -- confirm against its users.
      */
  80 int sysctl_tcp_low_latency;
  81
  82 /* Check TCP sequence numbers in ICMP packets. */
  83 #define ICMP_MIN_LENGTH 8
  84
  85 /* Socket used for sending RSTs */
  86 static struct socket *tcp_socket;
  87
     /* Forward declaration; the definition is not in this part of the file. */
  88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  89                        struct sk_buff *skb);
  90
  91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
             /* Only the lock, counter and wait-queue members get explicit
              * initializers; members not named here start out zeroed, as for
              * any object with static storage.
              * NOTE(review): the un-prefixed names used throughout this file
              * (tcp_lhash_lock, tcp_lhash_users, tcp_lhash_wait,
              * tcp_portalloc_lock) are presumably macros for these members --
              * confirm in net/tcp.h.
              */
  92         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
  93         .__tcp_lhash_users      =       ATOMIC_INIT(0),
  94         .__tcp_lhash_wait
  95           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
  96         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  97 };
  98
  99 /*
 100  * This array holds the first and last local port number.
 101  * For high-usage systems, use sysctl to change this to
 102  * 32768-61000
      *
      * Index 0 is read as the lower bound and index 1 as the upper bound by
      * tcp_v4_get_port() and tcp_v4_hash_connect().
 103  */
 104 int sysctl_local_port_range[2] = { 1024, 4999 };
     /* Port most recently reached by the automatic search in
      * tcp_v4_get_port(); the next search continues after it.  It is read
      * and written there under tcp_portalloc_lock.  The initial value is one
      * below the default lower bound.
      */
 105 int tcp_port_rover = 1024 - 1;
 106
 107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 108                                  __u32 faddr, __u16 fport)
 109 {
             /* XOR the whole 4-tuple into one word, fold the upper bits down
              * so they influence the low ones, then mask the result with
              * (tcp_ehash_size - 1) to obtain an index into tcp_ehash[].
              */
 110         int mix = laddr ^ faddr ^ lport ^ fport;
 111
 112         mix = mix ^ (mix >> 16);
 113         return (mix ^ (mix >> 8)) & (tcp_ehash_size - 1);
 114 }
 115
 116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
 117 {
 118         const struct inet_sock *isk = inet_sk(sk);
 119
 120         /* Hash the socket's own identity: the local half of the tuple is
 121          * (rcv_saddr, num) and the foreign half is (daddr, dport), passed
 122          * in the argument order tcp_hashfn() expects. */
 123         return tcp_hashfn(isk->rcv_saddr, isk->num,
 124                           isk->daddr, isk->dport);
 125 }
 126
 127 /* Create a bind bucket for local port @snum and link it at the head of
 128  * @head's chain.  Returns the new bucket, or NULL when the atomic slab
 129  * allocation fails.  The lock for snum's hash chain must be held here. */
 130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 131                                           unsigned short snum)
 132 {
 133         struct tcp_bind_bucket *bucket;
 134         bucket = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
 135         if (!bucket)
 136                 return NULL;
 137         bucket->port = snum;
 138         bucket->fastreuse = 0;
 139         INIT_HLIST_HEAD(&bucket->owners);
 140         hlist_add_head(&bucket->node, &head->chain);
 141         return bucket;
 142 }
 143
 144 /* Unlink and free @tb once no socket owns it; a bucket that still has
      * owners is left untouched.  Caller must hold the hashbucket lock for
      * this tb with local BH disabled.
      */
 145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
 146 {
 147         if (!hlist_empty(&tb->owners))
 148                 return;
 149         __hlist_del(&tb->node);
 150         kmem_cache_free(tcp_bucket_cachep, tb);
 151 }
 152
 153 /* Make @child a co-owner of the bind bucket that @sk is bound to.
      * The chain lock is chosen from child's port number (inet->num) and is
      * taken with a plain spin_lock(), so:
      * Caller must disable local BH processing.
      */
 154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
 155 {
 156         struct tcp_bind_hashbucket *head =
 157                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
 158         struct tcp_bind_bucket *tb;
 159
 160         spin_lock(&head->lock);
             /* Read the parent's bucket only once the chain lock is held. */
 161         tb = tcp_sk(sk)->bind_hash;
 162         sk_add_bind_node(child, &tb->owners);
 163         tcp_sk(child)->bind_hash = tb;
 164         spin_unlock(&head->lock);
 165 }
 166
 167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
 168 {
             /* Wrapper around __tcp_inherit_port() for callers that have not
              * disabled local BHs themselves.
              */
 169         local_bh_disable();
 170         __tcp_inherit_port(sk, child);
 171         local_bh_enable();
 172 }
 173
 174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 175                    unsigned short snum)
 176 {
             /* Attach sk to bind bucket tb: record the local port on the
              * socket, add it to the bucket's owner list and remember the
              * bucket in the socket.  No lock is taken here; the callers in
              * this file hold the bind-hash chain lock around the call.
              */
 177         inet_sk(sk)->num = snum;
 178         sk_add_bind_node(sk, &tb->owners);
 179         tcp_sk(sk)->bind_hash = tb;
 180 }
 181
 182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 183 {
             /* Returns 1 if some other owner of tb prevents sk from sharing
              * the port, 0 if no owner conflicts.
              */
 184         const u32 own_addr = tcp_v4_rcv_saddr(sk);
 185         const int own_reuse = sk->sk_reuse;
 186         struct hlist_node *pos;
 187         struct sock *other;
 188         u32 other_addr;
 189
 190         sk_for_each_bound(other, pos, &tb->owners) {
 191                 if (other == sk || tcp_v6_ipv6only(other))
 192                         continue;
 193                 /* Sockets pinned to two different devices cannot clash. */
 194                 if (sk->sk_bound_dev_if && other->sk_bound_dev_if &&
 195                     sk->sk_bound_dev_if != other->sk_bound_dev_if)
 196                         continue;
 197                 if (own_reuse && other->sk_reuse &&
 198                     other->sk_state != TCP_LISTEN)
 199                         continue; /* both allow reuse, other not listening */
 200                 other_addr = tcp_v4_rcv_saddr(other);
 201                 if (!other_addr || !own_addr || other_addr == own_addr)
 202                         return 1; /* wildcard or identical local address */
 203         }
 204         return 0;
 205 }
 206
 207 /* Obtain a reference to a local port for the given sock,
 208  * if snum is zero it means select any available local port.
      *
      * Returns 0 once sk is attached to the port's bind bucket, or 1 if the
      * port range is exhausted, the requested port conflicts with another
      * owner, or a bind bucket cannot be allocated.  Local BHs are disabled
      * for the duration of the call.
 209  */
 210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 211 {
 212         struct tcp_bind_hashbucket *head;
 213         struct hlist_node *node;
 214         struct tcp_bind_bucket *tb;
 215         int ret;
 216
 217         local_bh_disable();
 218         if (!snum) {
 219                 int low = sysctl_local_port_range[0];
 220                 int high = sysctl_local_port_range[1];
 221                 int remaining = (high - low) + 1;
 222                 int rover;
 223
                     /* Probe at most 'remaining' ports, continuing after the
                      * shared rover and wrapping from high back to low.  A
                      * port counts as free only if it has no bind bucket.
                      */
 224                 spin_lock(&tcp_portalloc_lock);
 225                 if (tcp_port_rover < low)
 226                         rover = low;
 227                 else
 228                         rover = tcp_port_rover;
 229                 do {
 230                         rover++;
 231                         if (rover > high)
 232                                 rover = low;
 233                         head = &tcp_bhash[tcp_bhashfn(rover)];
 234                         spin_lock(&head->lock);
 235                         tb_for_each(tb, node, &head->chain)
 236                                 if (tb->port == rover)
 237                                         goto next;
                             /* Free port: leave with head->lock still held. */
 238                         break;
 239                 next:
 240                         spin_unlock(&head->lock);
 241                 } while (--remaining > 0);
 242                 tcp_port_rover = rover;
 243                 spin_unlock(&tcp_portalloc_lock);
 244
 245                 /* Exhausted local port range during search?  It is not
 246                  * possible for us to be holding one of the bind hash
 247                  * locks if this test triggers, because if 'remaining'
 248                  * drops to zero, we broke out of the do/while loop at
 249                  * the top level, not from the 'break;' statement.
 250                  */
 251                 ret = 1;
 252                 if (unlikely(remaining <= 0))
 253                         goto fail;
 254
 255                 /* OK, here is the one we will use.  HEAD is
 256                  * non-NULL and we hold its mutex.
 257                  */
 258                 snum = rover;
 259         } else {
                     /* Explicit port: lock its chain and look for a bucket. */
 260                 head = &tcp_bhash[tcp_bhashfn(snum)];
 261                 spin_lock(&head->lock);
 262                 tb_for_each(tb, node, &head->chain)
 263                         if (tb->port == snum)
 264                                 goto tb_found;
 265         }
 266         tb = NULL;
 267         goto tb_not_found;
 268 tb_found:
 269         if (!hlist_empty(&tb->owners)) {
                     /* NOTE(review): sk_reuse values above 1 skip every
                      * conflict check; their meaning is not visible here.
                      */
 270                 if (sk->sk_reuse > 1)
 271                         goto success;
                     /* fastreuse > 0 means every owner so far had sk_reuse
                      * set and was not listening (see the bookkeeping below),
                      * so a reusing, non-listening sk can share the port
                      * without walking the owner list.
                      */
 272                 if (tb->fastreuse > 0 &&
 273                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
 274                         goto success;
 275                 } else {
 276                         ret = 1;
 277                         if (tcp_bind_conflict(sk, tb))
 278                                 goto fail_unlock;
 279                 }
 280         }
 281 tb_not_found:
 282         ret = 1;
 283         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
 284                 goto fail_unlock;
             /* Keep fastreuse meaningful: the first owner sets it, and any
              * later owner without reuse, or one that is listening, clears it.
              */
 285         if (hlist_empty(&tb->owners)) {
 286                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 287                         tb->fastreuse = 1;
 288                 else
 289                         tb->fastreuse = 0;
 290         } else if (tb->fastreuse &&
 291                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 292                 tb->fastreuse = 0;
 293 success:
             /* Attach sk only if it is not already bound to a bucket. */
 294         if (!tcp_sk(sk)->bind_hash)
 295                 tcp_bind_hash(sk, tb, snum);
 296         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
 297         ret = 0;
 298
 299 fail_unlock:
 300         spin_unlock(&head->lock);
 301 fail:
 302         local_bh_enable();
 303         return ret;
 304 }
 305
 306 /* Get rid of any references to a local port held by the
 307  * given sock.
      *
      * Removes sk from its bind bucket's owner list, clears the socket's
      * bind_hash and port number, and lets tcp_bucket_destroy() free the
      * bucket if sk was its last owner.  The chain lock is taken with a
      * plain spin_lock(); the wrapper tcp_put_port() disables BHs first.
 308  */
 309 static void __tcp_put_port(struct sock *sk)
 310 {
 311         struct inet_sock *inet = inet_sk(sk);
 312         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
 313         struct tcp_bind_bucket *tb;
 314
 315         spin_lock(&head->lock);
 316         tb = tcp_sk(sk)->bind_hash;
 317         __sk_del_bind_node(sk);
 318         tcp_sk(sk)->bind_hash = NULL;
 319         inet->num = 0;
 320         tcp_bucket_destroy(tb);
 321         spin_unlock(&head->lock);
 322 }
 323
 324 void tcp_put_port(struct sock *sk)
 325 {
             /* Release sk's local port with local BHs disabled around the
              * bind-hash manipulation done by __tcp_put_port().
              */
 326         local_bh_disable();
 327         __tcp_put_port(sk);
 328         local_bh_enable();
 329 }
 330
 331 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 332  * Look, when several writers sleep and reader wakes them up, all but one
 333  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 334  * this, _but_ remember, it adds useless work on UP machines (wake up each
 335  * exclusive lock release). It should be ifdefed really.
      *
      * tcp_listen_wlock() returns with tcp_lhash_lock write-held and
      * tcp_lhash_users observed as zero.  The first acquisition is a plain
      * write_lock(); the callers in this file (tcp_v4_hash() and
      * tcp_unhash()) disable local BHs before calling.  Around schedule() the
      * _bh variants are used, so BHs are enabled while sleeping and disabled
      * again once the lock is retaken.
 336  */
 337
 338 void tcp_listen_wlock(void)
 339 {
 340         write_lock(&tcp_lhash_lock);
 341
 342         if (atomic_read(&tcp_lhash_users)) {
 343                 DEFINE_WAIT(wait);
 344
 345                 for (;;) {
                             /* Queue ourselves before re-testing the counter
                              * so a wake-up between test and sleep is not lost.
                              */
 346                         prepare_to_wait_exclusive(&tcp_lhash_wait,
 347                                                 &wait, TASK_UNINTERRUPTIBLE);
 348                         if (!atomic_read(&tcp_lhash_users))
 349                                 break;
 350                         write_unlock_bh(&tcp_lhash_lock);
 351                         schedule();
 352                         write_lock_bh(&tcp_lhash_lock);
 353                 }
 354
 355                 finish_wait(&tcp_lhash_wait, &wait);
 356         }
 357 }
 358
 359 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 360 {
             /* Insert an unhashed socket into the listening hash (only when
              * listen_possible is non-zero and sk is in TCP_LISTEN) or into
              * the established hash, recording the chosen ehash slot in
              * sk->sk_hashent.  Locks are taken without the _bh variants; the
              * callers in this file run with local BHs disabled.
              */
 361         struct hlist_head *list;
 362         rwlock_t *lock;
 363
 364         BUG_TRAP(sk_unhashed(sk));
 365         if (listen_possible && sk->sk_state == TCP_LISTEN) {
 366                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 367                 lock = &tcp_lhash_lock;
 368                 tcp_listen_wlock();
 369         } else {
 370                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
 371                 lock = &tcp_ehash[sk->sk_hashent].lock;
 372                 write_lock(lock);
 373         }
 374         __sk_add_node(sk, list);
 375         sock_prot_inc_use(sk->sk_prot);
 376         write_unlock(lock);
             /* Wake any writer sleeping in tcp_listen_wlock(). */
 377         if (listen_possible && sk->sk_state == TCP_LISTEN)
 378                 wake_up(&tcp_lhash_wait);
 379 }
 380
 381 static void tcp_v4_hash(struct sock *sk)
 382 {
 383         if (sk->sk_state == TCP_CLOSE)
 384                 return;                 /* closed sockets stay unhashed */
 385         local_bh_disable();
 386         __tcp_v4_hash(sk, 1);           /* 1: sk may be a listener */
 387         local_bh_enable();
 388 }
 389
 390 void tcp_unhash(struct sock *sk)
 391 {
             /* Remove sk from whichever lookup table holds it: the listening
              * hash for a TCP_LISTEN socket, otherwise the ehash chain recorded
              * in sk->sk_hashent.  A socket that is not hashed is left alone.
              */
 392         rwlock_t *lock;
 393
 394         if (sk_unhashed(sk))
 395                 goto ende;
 396
 397         if (sk->sk_state == TCP_LISTEN) {
                     /* tcp_listen_wlock() starts with a plain write_lock(),
                      * so disable BHs here; write_unlock_bh() below undoes it.
                      */
 398                 local_bh_disable();
 399                 tcp_listen_wlock();
 400                 lock = &tcp_lhash_lock;
 401         } else {
 402                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
 403                 lock = &head->lock;
 404                 write_lock_bh(&head->lock);
 405         }
 406
 407         if (__sk_del_node_init(sk))
 408                 sock_prot_dec_use(sk->sk_prot);
 409         write_unlock_bh(lock);
 410
 411  ende:
             /* Also reached for an unhashed listener: wake any writer sleeping
              * in tcp_listen_wlock().
              */
 412         if (sk->sk_state == TCP_LISTEN)
 413                 wake_up(&tcp_lhash_wait);
 414 }
 415
 416 /* Slow-path listener search over one listening-hash chain.  Listeners
 417  * cannot specify a remote address or port, so only the local side is
 418  * matched.  Sockets on port @hnum score 1 for PF_INET, +2 for a bound
 419  * address equal to @daddr, +2 for a bound device equal to @dif (another
 420  * address or device disqualifies).  A 5 wins at once, else the first
 421  * best scorer or NULL is returned.  No reference is taken. */
 422 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
 423                                              unsigned short hnum, int dif)
 424 {
 425         struct sock *best = NULL, *sk;
 426         struct hlist_node *pos;
 427         int best_score = -1;
 428
 429         sk_for_each(sk, pos, head) {
 430                 const struct inet_sock *inet = inet_sk(sk);
 431                 __u32 bound;
 432                 int score;
 433
 434                 if (inet->num != hnum || ipv6_only_sock(sk))
 435                         continue;
 436                 bound = inet->rcv_saddr;
 437                 score = (sk->sk_family == PF_INET) ? 1 : 0;
 438                 if (bound) {
 439                         if (bound != daddr)
 440                                 continue;
 441                         score += 2;
 442                 }
 443                 if (sk->sk_bound_dev_if) {
 444                         if (sk->sk_bound_dev_if != dif)
 445                                 continue;
 446                         score += 2;
 447                 }
 448                 if (score == 5)
 449                         return sk;
 450                 if (score > best_score) {
 451                         best_score = score;
 452                         best = sk;
 453                 }
 454         }
 455         return best;
 456 }
 457
 458 /* Optimize the common listener case.
      *
      * Finds a listening socket for local address daddr and host-order port
      * hnum under the listening-hash read lock.  Returns the socket with a
      * reference taken by sock_hold(), or NULL if nothing matches.
      */
 459 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
 460                 unsigned short hnum, int dif)
 461 {
 462         struct sock *sk = NULL;
 463         struct hlist_head *head;
 464
 465         read_lock(&tcp_lhash_lock);
 466         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 467         if (!hlist_empty(head)) {
 468                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
 469
                     /* Fast path: the chain holds exactly one socket, it is on
                      * the right port, its address is a wildcard or equals
                      * daddr, and it is not bound to a device.  Nothing can
                      * outrank it, so skip the scoring walk.
                      */
 470                 if (inet->num == hnum && !sk->sk_node.next &&
 471                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 472                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 473                     !sk->sk_bound_dev_if)
 474                         goto sherry_cache;
 475                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
 476         }
 477         if (sk) {
             /* The fast path jumps straight here, into the if-body. */
 478 sherry_cache:
 479                 sock_hold(sk);
 480         }
 481         read_unlock(&tcp_lhash_lock);
 482         return sk;
 483 }
 484
 485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 486  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 487  *
 488  * Local BH must be disabled here.
      *
      * Looks up a fully specified connection.  saddr/sport are passed as the
      * foreign half of the hash and daddr/hnum as the local half.  The
      * established chain is searched first, then the TIME-WAIT chain of the
      * same bucket (tcp_ehash_size slots further on), both under the
      * bucket's read lock.  Returns the match with a reference taken by
      * sock_hold(), or NULL.
 489  */
 490
 491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
 492                                                        u32 daddr, u16 hnum,
 493                                                        int dif)
 494 {
 495         struct tcp_ehash_bucket *head;
             /* NOTE(review): TCP_V4_ADDR_COOKIE presumably declares 'acookie'
              * for the match macros below (hence no semicolon) -- confirm in
              * net/tcp.h.
              */
 496         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 497         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 498         struct sock *sk;
 499         struct hlist_node *node;
 500         /* Optimize here for direct hit, only listening connections can
 501          * have wildcards anyways.
 502          */
 503         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
 504         head = &tcp_ehash[hash];
 505         read_lock(&head->lock);
 506         sk_for_each(sk, node, &head->chain) {
 507                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 508                         goto hit; /* You sunk my battleship! */
 509         }
 510
 511         /* Must check for a TIME_WAIT'er before going to listener hash. */
 512         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
 513                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
 514                         goto hit;
 515         }
 516         sk = NULL;
 517 out:
 518         read_unlock(&head->lock);
 519         return sk;
 520 hit:
             /* Take the reference before the read lock is dropped at 'out'. */
 521         sock_hold(sk);
 522         goto out;
 523 }
 524
 525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 526                                            u32 daddr, u16 hnum, int dif)
 527 {
 528         struct sock *sk;
 529         /* Exact (established or TIME-WAIT) match first, listener second. */
 530         sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
 531         return sk ? sk : tcp_v4_lookup_listener(daddr, hnum, dif);
 532 }
 533
 534 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
 535                                   u16 dport, int dif)
 536 {
             /* Exported entry point for socket lookup.  dport is converted
              * with ntohs() before the lookup, while sport is passed through
              * unchanged.  The lookup runs with local BHs disabled, as
              * __tcp_v4_lookup_established() requires.  The result comes from
              * helpers that take a reference with sock_hold(), or is NULL.
              */
 537         struct sock *sk;
 538
 539         local_bh_disable();
 540         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
 541         local_bh_enable();
 542
 543         return sk;
 544 }
 545
 546 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
 547
 548 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 549 {
             /* Derive the initial sequence number from the addresses and
              * ports of the segment in skb, destination first.
              */
 550         const struct iphdr *iph = skb->nh.iph;
 551         const struct tcphdr *th = skb->h.th;
 552         return secure_tcp_sequence_number(iph->daddr, iph->saddr,
 553                                           th->dest, th->source);
 554 }
 555
 556 /* called with local bh disabled
      *
      * Checks that connecting sk from local port lport yields a 4-tuple not
      * already present in the established/TIME-WAIT table and, if so, hashes
      * sk there.  Returns 0 on success or -EADDRNOTAVAIL if the tuple is
      * taken.  A matching TIME-WAIT entry may be taken over: when twp is
      * non-NULL it is handed back through *twp with a reference held,
      * otherwise it is descheduled and released here.
      */
 557 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 558                                       struct tcp_tw_bucket **twp)
 559 {
 560         struct inet_sock *inet = inet_sk(sk);
             /* Same naming as the receive-side lookup: 'daddr' is our local
              * address and 'saddr' is the peer's.
              */
 561         u32 daddr = inet->rcv_saddr;
 562         u32 saddr = inet->daddr;
 563         int dif = sk->sk_bound_dev_if;
 564         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 565         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
 566         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
 567         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 568         struct sock *sk2;
 569         struct hlist_node *node;
 570         struct tcp_tw_bucket *tw;
 571
 572         write_lock(&head->lock);
 573
 574         /* Check TIME-WAIT sockets first. */
 575         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
 576                 tw = (struct tcp_tw_bucket *)sk2;
 577
 578                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 579                         struct tcp_sock *tp = tcp_sk(sk);
 580
 581                         /* With PAWS, it is safe from the viewpoint
 582                            of data integrity. Even without PAWS it
 583                            is safe provided sequence spaces do not
 584                            overlap i.e. at data rates <= 80Mbit/sec.
 585
 586                            Actually, the idea is close to VJ's one,
 587                            only timestamp cache is held not per host,
 588                            but per port pair and TW bucket is used
 589                            as state holder.
 590
 591                            If TW bucket has been already destroyed we
 592                            fall back to VJ's scheme and use initial
 593                            timestamp retrieved from peer table.
 594                          */
                             /* Take-over needs a recorded timestamp.  With a
                              * NULL twp that is enough; otherwise
                              * sysctl_tcp_tw_reuse must also be set and the
                              * stamp more than one second old.
                              */
 595                         if (tw->tw_ts_recent_stamp &&
 596                             (!twp || (sysctl_tcp_tw_reuse &&
 597                                       xtime.tv_sec -
 598                                       tw->tw_ts_recent_stamp > 1))) {
                                     /* Start beyond the old connection's
                                      * sequence space; 0 is avoided because
                                      * tcp_v4_connect() treats a zero
                                      * write_seq as "not chosen yet".
                                      */
 599                                 if ((tp->write_seq =
 600                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
 601                                         tp->write_seq = 1;
 602                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
 603                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
 604                                 sock_hold(sk2);
 605                                 goto unique;
 606                         } else
 607                                 goto not_unique;
 608                 }
 609         }
 610         tw = NULL;
 611
 612         /* And established part... */
 613         sk_for_each(sk2, node, &head->chain) {
 614                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 615                         goto not_unique;
 616         }
 617
 618 unique:
 619         /* Must record num and sport now. Otherwise we will see
 620          * in hash table socket with a funny identity. */
 621         inet->num = lport;
 622         inet->sport = htons(lport);
 623         sk->sk_hashent = hash;
 624         BUG_TRAP(sk_unhashed(sk));
 625         __sk_add_node(sk, &head->chain);
 626         sock_prot_inc_use(sk->sk_prot);
 627         write_unlock(&head->lock);
 628
             /* NOTE(review): with a non-NULL twp the recycle counter is bumped
              * even when tw is NULL, i.e. no TIME-WAIT entry was taken over.
              */
 629         if (twp) {
 630                 *twp = tw;
 631                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 632         } else if (tw) {
 633                 /* Silly. Should hash-dance instead... */
 634                 tcp_tw_deschedule(tw);
 635                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 636
 637                 tcp_tw_put(tw);
 638         }
 639
 640         return 0;
 641
 642 not_unique:
 643         write_unlock(&head->lock);
 644         return -EADDRNOTAVAIL;
 645 }
 646
 647 static inline u32 connect_port_offset(const struct sock *sk)
 648 {
             /* Per-destination offset added to the port search start in
              * tcp_v4_hash_connect(); computed by secure_tcp_port_ephemeral()
              * from the local address, remote address and remote port.
              */
 649         const struct inet_sock *inet = inet_sk(sk);
 650
 651         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 652                                          inet->dport);
 653 }
 654
 655 /*
 656  * Bind a port for a connect operation and hash it.
      *
      * If sk has no local port yet, one is searched for in the sysctl range,
      * bound, and sk is hashed.  If sk is already bound, it is only hashed,
      * after checking the 4-tuple when the port has other owners.
      * Returns 0 on success or -EADDRNOTAVAIL.
 657  */
 658 static inline int tcp_v4_hash_connect(struct sock *sk)
 659 {
 660         unsigned short snum = inet_sk(sk)->num;
 661         struct tcp_bind_hashbucket *head;
 662         struct tcp_bind_bucket *tb;
 663         int ret;
 664
 665         if (!snum) {
 666                 int low = sysctl_local_port_range[0];
 667                 int high = sysctl_local_port_range[1];
 668                 int range = high - low;
 669                 int i;
 670                 int port;
                     /* NOTE(review): 'hint' is shared by all callers and is
                      * updated below under a per-chain lock only -- confirm
                      * that racy updates are acceptable.
                      */
 671                 static u32 hint;
 672                 u32 offset = hint + connect_port_offset(sk);
 673                 struct hlist_node *node;
 674                 struct tcp_tw_bucket *tw = NULL;
 675
 676                 local_bh_disable();
                     /* Candidates are low + (i + offset) % range, i.e. ports
                      * low .. high - 1; with range <= 0 nothing is tried.
                      */
 677                 for (i = 1; i <= range; i++) {
 678                         port = low + (i + offset) % range;
 679                         head = &tcp_bhash[tcp_bhashfn(port)];
 680                         spin_lock(&head->lock);
 681
 682                         /* Does not bother with rcv_saddr checks,
 683                          * because the established check is already
 684                          * unique enough.
 685                          */
 686                         tb_for_each(tb, node, &head->chain) {
 687                                 if (tb->port == port) {
 688                                         BUG_TRAP(!hlist_empty(&tb->owners));
                                             /* Only buckets created by this
                                              * function (fastreuse == -1)
                                              * may be shared; buckets with
                                              * fastreuse >= 0 are skipped.
                                              */
 689                                         if (tb->fastreuse >= 0)
 690                                                 goto next_port;
 691                                         if (!__tcp_v4_check_established(sk,
 692                                                                         port,
 693                                                                         &tw))
 694                                                 goto ok;
 695                                         goto next_port;
 696                                 }
 697                         }
 698
                             /* No bucket for this port: create one. */
 699                         tb = tcp_bucket_create(head, port);
 700                         if (!tb) {
 701                                 spin_unlock(&head->lock);
 702                                 break;
 703                         }
 704                         tb->fastreuse = -1;
 705                         goto ok;
 706
 707                 next_port:
 708                         spin_unlock(&head->lock);
 709                 }
 710                 local_bh_enable();
 711
 712                 return -EADDRNOTAVAIL;
 713
 714 ok:
                     /* Advance the shared starting point past this attempt. */
 715                 hint += i;
 716
 717                 /* Head lock still held and bh's disabled */
 718                 tcp_bind_hash(sk, tb, port);
                     /* sk is already hashed if the established check ran. */
 719                 if (sk_unhashed(sk)) {
 720                         inet_sk(sk)->sport = htons(port);
 721                         __tcp_v4_hash(sk, 0);
 722                 }
 723                 spin_unlock(&head->lock);
 724
                     /* Retire the TIME-WAIT entry that was taken over. */
 725                 if (tw) {
 726                         tcp_tw_deschedule(tw);
 727                         tcp_tw_put(tw);
 728                 }
 729
 730                 ret = 0;
 731                 goto out;
 732         }
 733
             /* sk already has a local port. */
 734         head  = &tcp_bhash[tcp_bhashfn(snum)];
 735         tb  = tcp_sk(sk)->bind_hash;
 736         spin_lock_bh(&head->lock);
             /* sk is the sole owner of the port, so the tuple is unique. */
 737         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 738                 __tcp_v4_hash(sk, 0);
 739                 spin_unlock_bh(&head->lock);
 740                 return 0;
 741         } else {
                     /* Plain unlock: BHs stay disabled until 'out'. */
 742                 spin_unlock(&head->lock);
 743                 /* No definite answer... Walk to established hash table */
 744                 ret = __tcp_v4_check_established(sk, snum, NULL);
 745 out:
 746                 local_bh_enable();
 747                 return ret;
 748         }
 749 }
 750
 751 /* This will initiate an outgoing connection.
      *
      * Validates the destination in @uaddr, resolves a route with
      * ip_route_connect(), fills in the socket's addresses and destination
      * port, moves the socket to SYN-SENT, hashes it with
      * tcp_v4_hash_connect() and finally calls tcp_connect().
      *
      * Returns 0 on success.  Returns -EINVAL if @addr_len is shorter than a
      * struct sockaddr_in, or if a source route option is set and the
      * destination address is zero; -EAFNOSUPPORT if the family is not
      * AF_INET; -ENETUNREACH if the route is multicast or broadcast;
      * otherwise the negative value returned by the failing helper.
      *
      * Once the state has been set to SYN-SENT, any error goes through the
      * "failure" label, which puts the socket back into TCP_CLOSE, drops
      * the route, clears sk_route_caps and resets inet->dport.
      */
 752 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 753 {
 754         struct inet_sock *inet = inet_sk(sk);
 755         struct tcp_sock *tp = tcp_sk(sk);
 756         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 757         struct rtable *rt;
 758         u32 daddr, nexthop;
 759         int tmp;
 760         int err;
 761
 762         if (addr_len < sizeof(struct sockaddr_in))
 763                 return -EINVAL;
 764
 765         if (usin->sin_family != AF_INET)
 766                 return -EAFNOSUPPORT;
 767
             /* With a source route option the route lookup targets the
              * option's faddr instead of the user supplied destination.
              */
 768         nexthop = daddr = usin->sin_addr.s_addr;
 769         if (inet->opt && inet->opt->srr) {
 770                 if (!daddr)
 771                         return -EINVAL;
 772                 nexthop = inet->opt->faddr;
 773         }
 774
 775         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 776                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 777                                IPPROTO_TCP,
 778                                inet->sport, usin->sin_port, sk);
 779         if (tmp < 0)
 780                 return tmp;
 781
 782         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 783                 ip_rt_put(rt);
 784                 return -ENETUNREACH;
 785         }
 786
             /* Without source routing, take the destination from the route. */
 787         if (!inet->opt || !inet->opt->srr)
 788                 daddr = rt->rt_dst;
 789
             /* An unset source address is filled in from the route. */
 790         if (!inet->saddr)
 791                 inet->saddr = rt->rt_src;
 792         inet->rcv_saddr = inet->saddr;
 793
 794         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 795                 /* Reset inherited state */
 796                 tp->rx_opt.ts_recent       = 0;
 797                 tp->rx_opt.ts_recent_stamp = 0;
 798                 tp->write_seq              = 0;
 799         }
 800
 801         if (sysctl_tcp_tw_recycle &&
 802             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 803                 struct inet_peer *peer = rt_get_peer(rt);
 804
 805                 /* VJ's idea. We save last timestamp seen from
 806                  * the destination in peer table, when entering state TIME-WAIT
 807                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 808                  */
 809
 810                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 811                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 812                         tp->rx_opt.ts_recent = peer->tcp_ts;
 813                 }
 814         }
 815
 816         inet->dport = usin->sin_port;
 817         inet->daddr = daddr;
 818
             /* Account for the length of any IP options set on the socket. */
 819         tp->ext_header_len = 0;
 820         if (inet->opt)
 821                 tp->ext_header_len = inet->opt->optlen;
 822
 823         tp->rx_opt.mss_clamp = 536;
 824
 825         /* Socket identity is still unknown (sport may be zero).
 826          * However we set state to SYN-SENT and not releasing socket
 827          * lock select source port, enter ourselves into the hash tables and
 828          * complete initialization after this.
 829          */
 830         tcp_set_state(sk, TCP_SYN_SENT);
 831         err = tcp_v4_hash_connect(sk);
 832         if (err)
 833                 goto failure;
 834
             /* Redo the route lookup with the ports now stored in the socket. */
 835         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 836         if (err)
 837                 goto failure;
 838
 839         /* OK, now commit destination to socket.  */
 840         sk_setup_caps(sk, &rt->u.dst);
 841
             /* Pick an initial sequence number unless one is already set. */
 842         if (!tp->write_seq)
 843                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 844                                                            inet->daddr,
 845                                                            inet->sport,
 846                                                            usin->sin_port);
 847
 848         inet->id = tp->write_seq ^ jiffies;
 849
 850         err = tcp_connect(sk);
             /* rt is cleared before the error check, so the failure path
              * below calls ip_rt_put() with NULL in this case.  Presumably
              * the route reference now belongs to the socket after
              * sk_setup_caps() -- confirm.
              */
 851         rt = NULL;
 852         if (err)
 853                 goto failure;
 854
 855         return 0;
 856
 857 failure:
 858         /* This unhashes the socket and releases the local port, if necessary. */
 859         tcp_set_state(sk, TCP_CLOSE);
 860         ip_rt_put(rt);
 861         sk->sk_route_caps = 0;
 862         inet->dport = 0;
 863         return err;
 864 }
 865
     /* Return the rt_iif field of the route attached to @skb.
      * skb->dst is cast to struct rtable without any NULL or type check.
      */
 866 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 867 {
 868         return ((struct rtable *)skb->dst)->rt_iif;
 869 }
 870
     /* Map a remote address/port pair, mixed with the value @rnd, to an
      * index into the SYN table.  The jhash_2words() result is masked with
      * TCP_SYNQ_HSIZE - 1, which presumably relies on TCP_SYNQ_HSIZE being
      * a power of two -- confirm against its definition.
      */
 871 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 872 {
 873         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 874 }
 875
     /* Look up a pending connection request on a listening socket.
      *
      * Walks the syn_table chain selected by hashing (@raddr, @rport) and
      * returns the first request whose remote port, remote address and
      * local address equal @rport, @raddr and @laddr and whose family
      * passes TCP_INET_FAMILY().
      *
      * On a match *@prevp is set to the link that points at the returned
      * request.  If nothing matches, NULL is returned and *@prevp is left
      * untouched.
      */
 876 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
 877                                               struct request_sock ***prevp,
 878                                               __u16 rport,
 879                                               __u32 raddr, __u32 laddr)
 880 {
 881         struct listen_sock *lopt = tp->accept_queue.listen_opt;
 882         struct request_sock *req, **prev;
 883
             /* prev always points at the link that leads to req. */
 884         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 885              (req = *prev) != NULL;
 886              prev = &req->dl_next) {
 887                 const struct inet_request_sock *ireq = inet_rsk(req);
 888
 889                 if (ireq->rmt_port == rport &&
 890                     ireq->rmt_addr == raddr &&
 891                     ireq->loc_addr == laddr &&
 892                     TCP_INET_FAMILY(req->rsk_ops->family)) {
 893                         BUG_TRAP(!req->sk);
 894                         *prevp = prev;
 895                         break;
 896                 }
 897         }
 898
             /* req is NULL here when the chain was exhausted. */
 899         return req;
 900 }
 901
     /* Insert @req into the SYN table of listening socket @sk.
      *
      * The bucket is computed from the request's remote address and port
      * together with the listen socket's hash_rnd; the request is queued
      * through reqsk_queue_hash_req() with TCP_TIMEOUT_INIT and
      * tcp_synq_added() is then called on @sk.
      */
 902 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
 903 {
 904         struct tcp_sock *tp = tcp_sk(sk);
 905         struct listen_sock *lopt = tp->accept_queue.listen_opt;
 906         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 907
 908         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
 909         tcp_synq_added(sk);
 910 }
 911
 912
 913 /*
 914  * This routine does path mtu discovery as defined in RFC1191.
      *
      * @sk:  socket the ICMP message was matched to
      * @iph: IP header quoted in the ICMP message (not used in the body)
      * @mtu: MTU value reported by the ICMP message
      *
      * Does nothing for sockets in TCP_LISTEN or when the socket has no
      * cached route.  Otherwise the route's update_pmtu() hook is called,
      * a soft EMSGSIZE error may be recorded, and if the resulting route
      * MTU is below tp->pmtu_cookie (and PMTU discovery is not disabled on
      * the socket) the MSS is resynchronised and a retransmit is started.
 915  */
 916 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 917                                      u32 mtu)
 918 {
 919         struct dst_entry *dst;
 920         struct inet_sock *inet = inet_sk(sk);
 921         struct tcp_sock *tp = tcp_sk(sk);
 922
 923         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 924          * send out by Linux are always <576bytes so they should go through
 925          * unfragmented).
 926          */
 927         if (sk->sk_state == TCP_LISTEN)
 928                 return;
 929
 930         /* We don't check in the destentry if pmtu discovery is forbidden
 931          * on this route. We just assume that no packet_to_big packets
 932          * are send back when pmtu discovery is not active.
 933          * There is a small race when the user changes this flag in the
 934          * route, but I think that's acceptable.
 935          */
 936         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 937                 return;
 938
 939         dst->ops->update_pmtu(dst, mtu);
 940
 941         /* Something is about to be wrong... Remember soft error
 942          * for the case, if this connection will not able to recover.
 943          */
 944         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 945                 sk->sk_err_soft = EMSGSIZE;
 946
             /* From here on work with the MTU the route now reports. */
 947         mtu = dst_mtu(dst);
 948
 949         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 950             tp->pmtu_cookie > mtu) {
 951                 tcp_sync_mss(sk, mtu);
 952
 953                 /* Resend the TCP packet because it's
 954                  * clear that the old packet has been
 955                  * dropped. This is the new "fast" path mtu
 956                  * discovery.
 957                  */
 958                 tcp_simple_retransmit(sk);
 959         } /* else let the usual retransmit timer handle it */
 960 }
 961
 962 /*
 963  * This routine is called by the ICMP module when it gets some
 964  * sort of error condition.  If err < 0 then the socket should
 965  * be closed and the error returned to the user.  If err > 0
 966  * it's just the icmp type << 8 | icmp code.  After adjustment
 967  * header points to the first 8 bytes of the tcp header.  We need
 968  * to find the appropriate port.
 969  *
 970  * The locking strategy used here is very "optimistic". When
 971  * someone else accesses the socket the ICMP is just dropped
 972  * and for some paths there is no check at all.
 973  * A more general error queue to queue errors for later handling
 974  * is probably better.
 975  *
      * @skb:  buffer whose data starts at the IP header quoted by the ICMP
      *        message; the ICMP type and code are read from skb->h.icmph
      * @info: extra value from the ICMP message; it is passed on as the
      *        MTU to do_pmtu_discovery() for ICMP_FRAG_NEEDED
      *
      * Every path that reaches the "out" label unlocks the socket and
      * drops the reference obtained from tcp_v4_lookup().
 976  */
 977
 978 void tcp_v4_err(struct sk_buff *skb, u32 info)
 979 {
 980         struct iphdr *iph = (struct iphdr *)skb->data;
 981         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 982         struct tcp_sock *tp;
 983         struct inet_sock *inet;
 984         int type = skb->h.icmph->type;
 985         int code = skb->h.icmph->code;
 986         struct sock *sk;
 987         __u32 seq;
 988         int err;
 989
             /* Need the full IP header plus 8 bytes of the TCP header. */
 990         if (skb->len < (iph->ihl << 2) + 8) {
 991                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 992                 return;
 993         }
 994
             /* The quoted packet is one we sent, so its destination is the
              * remote end and its source is the local end.
              */
 995         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
 996                            th->source, tcp_v4_iif(skb));
 997         if (!sk) {
 998                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 999                 return;
1000         }
             /* TIME_WAIT entries are only released; nothing is reported. */
1001         if (sk->sk_state == TCP_TIME_WAIT) {
1002                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1003                 return;
1004         }
1005
1006         bh_lock_sock(sk);
1007         /* If too many ICMPs get dropped on busy
1008          * servers this needs to be solved differently.
1009          */
             /* Only the counter is bumped here; processing continues and
              * the individual branches below re-check sock_owned_by_user().
              */
1010         if (sock_owned_by_user(sk))
1011                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1012
1013         if (sk->sk_state == TCP_CLOSE)
1014                 goto out;
1015
1016         tp = tcp_sk(sk);
1017         seq = ntohl(th->seq);
             /* NOTE(review): this uses NET_INC_STATS while the same counter
              * is bumped with NET_INC_STATS_BH in the TCP_LISTEN branch
              * below -- confirm which variant is intended.
              */
1018         if (sk->sk_state != TCP_LISTEN &&
1019             !between(seq, tp->snd_una, tp->snd_nxt)) {
1020                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1021                 goto out;
1022         }
1023
             /* Translate the ICMP type/code into an errno value in err. */
1024         switch (type) {
1025         case ICMP_SOURCE_QUENCH:
1026                 /* Just silently ignore these. */
1027                 goto out;
1028         case ICMP_PARAMETERPROB:
1029                 err = EPROTO;
1030                 break;
1031         case ICMP_DEST_UNREACH:
1032                 if (code > NR_ICMP_UNREACH)
1033                         goto out;
1034
1035                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1036                         if (!sock_owned_by_user(sk))
1037                                 do_pmtu_discovery(sk, iph, info);
1038                         goto out;
1039                 }
1040
1041                 err = icmp_err_convert[code].errno;
1042                 break;
1043         case ICMP_TIME_EXCEEDED:
1044                 err = EHOSTUNREACH;
1045                 break;
1046         default:
1047                 goto out;
1048         }
1049
             /* States not listed in this switch fall through to the
              * generic handling after it.
              */
1050         switch (sk->sk_state) {
1051                 struct request_sock *req, **prev;
1052         case TCP_LISTEN:
1053                 if (sock_owned_by_user(sk))
1054                         goto out;
1055
1056                 req = tcp_v4_search_req(tp, &prev, th->dest,
1057                                         iph->daddr, iph->saddr);
1058                 if (!req)
1059                         goto out;
1060
1061                 /* ICMPs are not backlogged, hence we cannot get
1062                    an established socket here.
1063                  */
1064                 BUG_TRAP(!req->sk);
1065
1066                 if (seq != tcp_rsk(req)->snt_isn) {
1067                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1068                         goto out;
1069                 }
1070
1071                 /*
1072                  * Still in SYN_RECV, just remove it silently.
1073                  * There is no good way to pass the error to the newly
1074                  * created socket, and POSIX does not want network
1075                  * errors returned from accept().
1076                  */
1077                 tcp_synq_drop(sk, req, prev);
1078                 goto out;
1079
1080         case TCP_SYN_SENT:
1081         case TCP_SYN_RECV:  /* Cannot happen.
1082                                It can f.e. if SYNs crossed.
1083                              */
                     /* Not owned by the user: report the error and finish
                      * the socket.  Otherwise only record a soft error.
                      */
1084                 if (!sock_owned_by_user(sk)) {
1085                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1086                         sk->sk_err = err;
1087
1088                         sk->sk_error_report(sk);
1089
1090                         tcp_done(sk);
1091                 } else {
1092                         sk->sk_err_soft = err;
1093                 }
1094                 goto out;
1095         }
1096
1097         /* If we've already connected we will keep trying
1098          * until we time out, or the user gives up.
1099          *
1100          * rfc1122 4.2.3.9 allows to consider as hard errors
1101          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1102          * but it is obsoleted by pmtu discovery).
1103          *
1104          * Note, that in modern internet, where routing is unreliable
1105          * and in each dark corner broken firewalls sit, sending random
1106          * errors ordered by their masters even this two messages finally lose
1107          * their original sense (even Linux sends invalid PORT_UNREACHs)
1108          *
1109          * Now we are in compliance with RFCs.
1110          *                                                      --ANK (980905)
1111          */
1112
1113         inet = inet_sk(sk);
1114         if (!sock_owned_by_user(sk) && inet->recverr) {
1115                 sk->sk_err = err;
1116                 sk->sk_error_report(sk);
1117         } else  { /* Only an error on timeout */
1118                 sk->sk_err_soft = err;
1119         }
1120
1121 out:
1122         bh_unlock_sock(sk);
1123         sock_put(sk);
1124 }
1125
1126 /* This routine computes an IPv4 TCP checksum.
      *
      * The result is stored in th->check, using the socket's saddr/daddr
      * for the pseudo header and @len as the TCP length.
      *
      * For CHECKSUM_HW buffers only the complemented pseudo-header sum is
      * stored and skb->csum is set to the offset of the check field;
      * presumably the remainder is completed later by the device or the
      * stack -- confirm.  Otherwise the sum over the th->doff << 2 header
      * bytes is folded in on top of the value already held in skb->csum.
      */
1127 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1128                        struct sk_buff *skb)
1129 {
1130         struct inet_sock *inet = inet_sk(sk);
1131
1132         if (skb->ip_summed == CHECKSUM_HW) {
1133                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1134                 skb->csum = offsetof(struct tcphdr, check);
1135         } else {
1136                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1137                                          csum_partial((char *)th,
1138                                                       th->doff << 2,
1139                                                       skb->csum));
1140         }
1141 }
1142
1143 /*
1144  *      This routine will send an RST to the other tcp.
1145  *
1146  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1147  *                    for reset.
1148  *      Answer: if a packet caused RST, it is not for a socket
1149  *              existing in our system, if it is matched to a socket,
1150  *              it is just duplicate segment or bug in other side's TCP.
1151  *              So that we build reply only basing on parameters
1152  *              arrived with segment.
1153  *      Exception: precedence violation. We do not implement it in any case.
      *
      *      @skb is the received segment being answered.  Nothing is sent
      *      if that segment itself has RST set or if its route is not of
      *      type RTN_LOCAL.
1154  */
1155
1156 static void tcp_v4_send_reset(struct sk_buff *skb)
1157 {
1158         struct tcphdr *th = skb->h.th;
1159         struct tcphdr rth;
1160         struct ip_reply_arg arg;
1161
1162         /* Never send a reset in response to a reset. */
1163         if (th->rst)
1164                 return;
1165
1166         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1167                 return;
1168
1169         /* Swap the send and the receive. */
1170         memset(&rth, 0, sizeof(struct tcphdr));
1171         rth.dest   = th->source;
1172         rth.source = th->dest;
1173         rth.doff   = sizeof(struct tcphdr) / 4;
1174         rth.rst    = 1;
1175
             /* If the offending segment carried an ACK, the reset takes its
              * sequence number from that ACK.  Otherwise the reset carries
              * an ACK covering the segment: its sequence number plus one
              * for each of SYN and FIN plus the payload length.
              */
1176         if (th->ack) {
1177                 rth.seq = th->ack_seq;
1178         } else {
1179                 rth.ack = 1;
1180                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1181                                     skb->len - (th->doff << 2));
1182         }
1183
             /* Describe the reply for ip_send_reply(): a single iovec holding
              * the header, the pseudo-header checksum seed, and the position
              * of the check field expressed in 16-bit words.
              */
1184         memset(&arg, 0, sizeof arg);
1185         arg.iov[0].iov_base = (unsigned char *)&rth;
1186         arg.iov[0].iov_len  = sizeof rth;
1187         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1188                                       skb->nh.iph->saddr, /*XXX*/
1189                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1190         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1191
1192         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1193
1194         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1195         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1196 }
1197
1198 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1199    outside socket context is ugly, certainly. What can I do?
     
        tcp_v4_send_ack() builds a bare ACK in reply to @skb and sends it
        through ip_send_reply() on tcp_socket->sk.

        @seq, @ack: sequence and acknowledgment numbers, host byte order
        @win:       window value, converted with htons()
        @ts:        if non-zero, a timestamp option is appended with
                    tcp_time_stamp as the value and @ts as the echo
1200  */
1201
1202 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1203                             u32 win, u32 ts)
1204 {
1205         struct tcphdr *th = skb->h.th;
1206         struct {
1207                 struct tcphdr th;
1208                 u32 tsopt[3];
1209         } rep;
1210         struct ip_reply_arg arg;
1211
             /* Only the header part of rep is cleared; tsopt is filled in
              * below whenever it is included in the reply.
              */
1212         memset(&rep.th, 0, sizeof(struct tcphdr));
1213         memset(&arg, 0, sizeof arg);
1214
1215         arg.iov[0].iov_base = (unsigned char *)&rep;
1216         arg.iov[0].iov_len  = sizeof(rep.th);
1217         if (ts) {
1218                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1219                                      (TCPOPT_TIMESTAMP << 8) |
1220                                      TCPOLEN_TIMESTAMP);
1221                 rep.tsopt[1] = htonl(tcp_time_stamp);
1222                 rep.tsopt[2] = htonl(ts);
1223                 arg.iov[0].iov_len = sizeof(rep);
1224         }
1225
1226         /* Swap the send and the receive. */
1227         rep.th.dest    = th->source;
1228         rep.th.source  = th->dest;
             /* doff is in 32-bit words and covers the option when present. */
1229         rep.th.doff    = arg.iov[0].iov_len / 4;
1230         rep.th.seq     = htonl(seq);
1231         rep.th.ack_seq = htonl(ack);
1232         rep.th.ack     = 1;
1233         rep.th.window  = htons(win);
1234
1235         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1236                                       skb->nh.iph->saddr, /*XXX*/
1237                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1238         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1239
1240         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1241
1242         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1243 }
1244
     /* Answer @skb with an ACK built from the TIME-WAIT bucket behind @sk.
      *
      * @sk is treated as a struct tcp_tw_bucket.  The advertised window is
      * tw_rcv_wnd shifted right by tw_rcv_wscale.  tcp_tw_put() is called
      * on the bucket before returning.
      */
1245 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1246 {
1247         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1248
1249         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1250                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1251
1252         tcp_tw_put(tw);
1253 }
1254
     /* Answer @skb with an ACK on behalf of the pending request @req.
      *
      * The sequence number is snt_isn + 1 and the acknowledgment number is
      * rcv_isn + 1; the window and timestamp echo come from req->rcv_wnd
      * and req->ts_recent.
      */
1255 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1256 {
1257         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1258                         req->ts_recent);
1259 }
1260
     /* Look up an output route for answering the connection request @req
      * received on listening socket @sk.
      *
      * The flow goes from the request's local address to its remote
      * address, or to the option's faddr when the saved IP options carry a
      * source route.  Returns the dst_entry embedded in the route, or NULL
      * (after bumping IPSTATS_MIB_OUTNOROUTES) if the lookup fails or if a
      * strict source route is requested and the route's destination
      * differs from its gateway.
      */
1261 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1262                                           struct request_sock *req)
1263 {
1264         struct rtable *rt;
1265         const struct inet_request_sock *ireq = inet_rsk(req);
1266         struct ip_options *opt = inet_rsk(req)->opt;
1267         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1268                             .nl_u = { .ip4_u =
1269                                       { .daddr = ((opt && opt->srr) ?
1270                                                   opt->faddr :
1271                                                   ireq->rmt_addr),
1272                                         .saddr = ireq->loc_addr,
1273                                         .tos = RT_CONN_FLAGS(sk) } },
1274                             .proto = IPPROTO_TCP,
1275                             .uli_u = { .ports =
1276                                        { .sport = inet_sk(sk)->sport,
1277                                          .dport = ireq->rmt_port } } };
1278
1279         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1280                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1281                 return NULL;
1282         }
             /* The route obtained above is dropped again on this path. */
1283         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1284                 ip_rt_put(rt);
1285                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1286                 return NULL;
1287         }
1288         return &rt->u.dst;
1289 }
1290
1291 /*
1292  *      Send a SYN-ACK for a pending connection request.
1293  *      This still operates on a request_sock only, not on a big
1294  *      socket.
      *
      *      @dst may be NULL, in which case a route is looked up with
      *      tcp_v4_route_req().  Returns -1 if no route is found or
      *      tcp_make_synack() yields no buffer; otherwise the result of
      *      ip_build_and_send_pkt(), with NET_XMIT_CN reported as 0.
      *      dst_release() is called on @dst on every path.
1295  */
1296 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1297                               struct dst_entry *dst)
1298 {
1299         const struct inet_request_sock *ireq = inet_rsk(req);
1300         int err = -1;
1301         struct sk_buff * skb;
1302
1303         /* First, grab a route. */
1304         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1305                 goto out;
1306
1307         skb = tcp_make_synack(sk, dst, req);
1308
1309         if (skb) {
1310                 struct tcphdr *th = skb->h.th;
1311
                     /* Checksum the whole segment using the request's
                      * local and remote addresses.
                      */
1312                 th->check = tcp_v4_check(th, skb->len,
1313                                          ireq->loc_addr,
1314                                          ireq->rmt_addr,
1315                                          csum_partial((char *)th, skb->len,
1316                                                       skb->csum));
1317
1318                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1319                                             ireq->rmt_addr,
1320                                             ireq->opt);
1321                 if (err == NET_XMIT_CN)
1322                         err = 0;
1323         }
1324
1325 out:
             /* dst is NULL here when the route lookup failed; presumably
              * dst_release() accepts NULL -- confirm.
              */
1326         dst_release(dst);
1327         return err;
1328 }
1329
1330 /*
1331  *      IPv4 request_sock destructor.
      *
      *      Frees the IP options attached to the request, if any.
      *      NOTE(review): the NULL test is redundant if kfree() accepts
      *      NULL -- confirm before removing it.
1332  */
1333 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1334 {
1335         if (inet_rsk(req)->opt)
1336                 kfree(inet_rsk(req)->opt);
1337 }
1338
     /* Log a "possible SYN flooding" message naming the destination port
      * of @skb.  The message is printed only when more than HZ * 60
      * jiffies have passed since the time kept in the function-local
      * static "warntime", which is updated without any locking.
      */
1339 static inline void syn_flood_warning(struct sk_buff *skb)
1340 {
1341         static unsigned long warntime;
1342
1343         if (time_after(jiffies, (warntime + HZ * 60))) {
1344                 warntime = jiffies;
1345                 printk(KERN_INFO
1346                        "possible SYN flooding on port %d. Sending cookies.\n",
1347                        ntohs(skb->h.th->dest));
1348         }
1349 }
1350
1351 /*
1352  * Save and compile IPv4 options into the request_sock if needed.
      *
      * Returns a buffer allocated with kmalloc(GFP_ATOMIC) and filled in by
      * ip_options_echo(), or NULL when @skb carries no options, the
      * allocation fails or ip_options_echo() reports an error.  @sk is not
      * used in the body.
1353  */
1354 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1355                                                      struct sk_buff *skb)
1356 {
1357         struct ip_options *opt = &(IPCB(skb)->opt);
1358         struct ip_options *dopt = NULL;
1359
             /* opt is the address of a member, so only optlen matters here. */
1360         if (opt && opt->optlen) {
1361                 int opt_size = optlength(opt);
1362                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1363                 if (dopt) {
1364                         if (ip_options_echo(dopt, skb)) {
1365                                 kfree(dopt);
1366                                 dopt = NULL;
1367                         }
1368                 }
1369         }
1370         return dopt;
1371 }
1372
     /* Operations table for IPv4 TCP connection requests.  It is passed to
      * reqsk_alloc() in tcp_v4_conn_request() and wires up the SYN-ACK
      * (re)transmit, ACK, reset and destructor helpers defined in this file.
      */
1373 struct request_sock_ops tcp_request_sock_ops = {
1374         .family         =       PF_INET,
1375         .obj_size       =       sizeof(struct tcp_request_sock),
1376         .rtx_syn_ack    =       tcp_v4_send_synack,
1377         .send_ack       =       tcp_v4_reqsk_send_ack,
1378         .destructor     =       tcp_v4_reqsk_destructor,
1379         .send_reset     =       tcp_v4_send_reset,
1380 };
1381
     /* Handle an incoming connection request @skb on listening socket @sk.
      *
      * Allocates a request_sock, parses the TCP options, records the
      * addresses and saved IP options, chooses an initial sequence number
      * (a syncookie when the SYN queue is full and syncookies are enabled)
      * and sends a SYN-ACK.  Without cookies the request is then added to
      * the SYN queue; with cookies it is freed right after the SYN-ACK.
      *
      * Always returns 0.  Dropped requests are only visible through the
      * TCP_MIB_ATTEMPTFAILS counter.
      */
1382 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1383 {
1384         struct inet_request_sock *ireq;
1385         struct tcp_options_received tmp_opt;
1386         struct request_sock *req;
1387         __u32 saddr = skb->nh.iph->saddr;
1388         __u32 daddr = skb->nh.iph->daddr;
             /* A non-zero value here means the request hit a live timewait
              * bucket (see the comment further down).
              */
1389         __u32 isn = TCP_SKB_CB(skb)->when;
1390         struct dst_entry *dst = NULL;
1391 #ifdef CONFIG_SYN_COOKIES
1392         int want_cookie = 0;
1393 #else
1394 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1395 #endif
1396
1397         /* Never answer to SYNs send to broadcast or multicast */
1398         if (((struct rtable *)skb->dst)->rt_flags &
1399             (RTCF_BROADCAST | RTCF_MULTICAST))
1400                 goto drop;
1401
1402         /* TW buckets are converted to open requests without
1403          * limitations, they conserve resources and peer is
1404          * evidently real one.
1405          */
1406         if (tcp_synq_is_full(sk) && !isn) {
1407 #ifdef CONFIG_SYN_COOKIES
1408                 if (sysctl_tcp_syncookies) {
1409                         want_cookie = 1;
1410                 } else
1411 #endif
1412                 goto drop;
1413         }
1414
1415         /* Accept backlog is full. If we have already queued enough
1416          * of warm entries in syn queue, drop request. It is better than
1417          * clogging syn queue with openreqs with exponentially increasing
1418          * timeout.
1419          */
1420         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1421                 goto drop;
1422
1423         req = reqsk_alloc(&tcp_request_sock_ops);
1424         if (!req)
1425                 goto drop;
1426
1427         tcp_clear_options(&tmp_opt);
1428         tmp_opt.mss_clamp = 536;
1429         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1430
1431         tcp_parse_options(skb, &tmp_opt, 0);
1432
             /* In cookie mode the parsed options are cleared again. */
1433         if (want_cookie) {
1434                 tcp_clear_options(&tmp_opt);
1435                 tmp_opt.saw_tstamp = 0;
1436         }
1437
1438         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1439                 /* Some OSes (unknown ones, but I see them on web server, which
1440                  * contains information interesting only for windows'
1441                  * users) do not send their stamp in SYN. It is easy case.
1442                  * We simply do not advertise TS support.
1443                  */
1444                 tmp_opt.saw_tstamp = 0;
1445                 tmp_opt.tstamp_ok  = 0;
1446         }
             /* This assignment overrides the tstamp_ok value set just above. */
1447         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1448
1449         tcp_openreq_init(req, &tmp_opt, skb);
1450
             /* From our side the packet's destination is the local address. */
1451         ireq = inet_rsk(req);
1452         ireq->loc_addr = daddr;
1453         ireq->rmt_addr = saddr;
1454         ireq->opt = tcp_v4_save_options(sk, skb);
1455         if (!want_cookie)
1456                 TCP_ECN_create_request(req, skb->h.th);
1457
1458         if (want_cookie) {
1459 #ifdef CONFIG_SYN_COOKIES
1460                 syn_flood_warning(skb);
1461 #endif
1462                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1463         } else if (!isn) {
1464                 struct inet_peer *peer = NULL;
1465
1466                 /* VJ's idea. We save last timestamp seen
1467                  * from the destination in peer table, when entering
1468                  * state TIME-WAIT, and check against it before
1469                  * accepting new connection request.
1470                  *
1471                  * If "isn" is not zero, this request hit alive
1472                  * timewait bucket, so that all the necessary checks
1473                  * are made in the function processing timewait state.
1474                  */
                     /* The condition below may assign dst and peer as a
                      * side effect; the else-if branch relies on whatever
                      * values they ended up with.
                      */
1475                 if (tmp_opt.saw_tstamp &&
1476                     sysctl_tcp_tw_recycle &&
1477                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1478                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1479                     peer->v4daddr == saddr) {
1480                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1481                             (s32)(peer->tcp_ts - req->ts_recent) >
1482                                                         TCP_PAWS_WINDOW) {
1483                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1484                                 dst_release(dst);
1485                                 goto drop_and_free;
1486                         }
1487                 }
1488                 /* Kill the following clause, if you dislike this way. */
1489                 else if (!sysctl_tcp_syncookies &&
1490                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1491                           (sysctl_max_syn_backlog >> 2)) &&
1492                          (!peer || !peer->tcp_ts_stamp) &&
1493                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1494                         /* Without syncookies last quarter of
1495                          * backlog is filled with destinations,
1496                          * proven to be alive.
1497                          * It means that we continue to communicate
1498                          * to destinations, already remembered
1499                          * to the moment of synflood.
1500                          */
1501                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1502                                               "request from %u.%u."
1503                                               "%u.%u/%u\n",
1504                                               NIPQUAD(saddr),
1505                                               ntohs(skb->h.th->source)));
1506                         dst_release(dst);
1507                         goto drop_and_free;
1508                 }
1509
1510                 isn = tcp_v4_init_sequence(sk, skb);
1511         }
1512         tcp_rsk(req)->snt_isn = isn;
1513
             /* tcp_v4_send_synack() calls dst_release() on dst itself, so
              * dst is not released again here.
              */
1514         if (tcp_v4_send_synack(sk, req, dst))
1515                 goto drop_and_free;
1516
1517         if (want_cookie) {
1518                 reqsk_free(req);
1519         } else {
1520                 tcp_v4_synq_add(sk, req);
1521         }
1522         return 0;
1523
1524 drop_and_free:
1525         reqsk_free(req);
1526 drop:
1527         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1528         return 0;
1529 }
1530
1531
1532 /*
1533  * The three way handshake has completed - we got a valid synack -
1534  * now create the new socket.
      *
      * @sk:  listening socket
      * @skb: segment that completed the handshake
      * @req: the pending request being turned into a socket
      * @dst: route to use, or NULL to look one up with tcp_v4_route_req()
      *
      * Returns the new socket after hashing it and inheriting the port from
      * @sk.  Returns NULL if the accept queue is full, no route is found or
      * the child socket cannot be created; those paths bump
      * LINUX_MIB_LISTENDROPS (and LINUX_MIB_LISTENOVERFLOWS for a full
      * accept queue) and call dst_release() on @dst.
1535  */
1536 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1537                                   struct request_sock *req,
1538                                   struct dst_entry *dst)
1539 {
1540         struct inet_request_sock *ireq;
1541         struct inet_sock *newinet;
1542         struct tcp_sock *newtp;
1543         struct sock *newsk;
1544
1545         if (sk_acceptq_is_full(sk))
1546                 goto exit_overflow;
1547
1548         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1549                 goto exit;
1550
1551         newsk = tcp_create_openreq_child(sk, req, skb);
1552         if (!newsk)
1553                 goto exit;
1554
1555         sk_setup_caps(newsk, dst);
1556
1557         newtp                 = tcp_sk(newsk);
1558         newinet               = inet_sk(newsk);
1559         ireq                  = inet_rsk(req);
1560         newinet->daddr        = ireq->rmt_addr;
1561         newinet->rcv_saddr    = ireq->loc_addr;
1562         newinet->saddr        = ireq->loc_addr;
             /* The saved IP options move from the request to the new socket;
              * clearing ireq->opt keeps the request destructor from freeing
              * them.
              */
1563         newinet->opt          = ireq->opt;
1564         ireq->opt             = NULL;
1565         newinet->mc_index     = tcp_v4_iif(skb);
1566         newinet->mc_ttl       = skb->nh.iph->ttl;
1567         newtp->ext_header_len = 0;
1568         if (newinet->opt)
1569                 newtp->ext_header_len = newinet->opt->optlen;
1570         newinet->id = newtp->write_seq ^ jiffies;
1571
1572         tcp_sync_mss(newsk, dst_mtu(dst));
1573         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1574         tcp_initialize_rcv_mss(newsk);
1575
1576         __tcp_v4_hash(newsk, 0);
1577         __tcp_inherit_port(sk, newsk);
1578
1579         return newsk;
1580
1581 exit_overflow:
1582         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1583 exit:
1584         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1585         dst_release(dst);
1586         return NULL;
1587 }
1588
     /* Decide which socket should process @skb, which arrived for listening
      * socket @sk.
      *
      * - If a pending request matches, the result of tcp_check_req() is
      *   returned.
      * - Otherwise, if an established-table entry matches, it is returned
      *   with bh_lock_sock() held, unless it is in TIME_WAIT, in which case
      *   it is released with tcp_tw_put() and NULL is returned.
      * - Otherwise @sk itself is returned; with CONFIG_SYN_COOKIES a
      *   segment with ACK set and neither RST nor SYN is first passed to
      *   cookie_v4_check(), whose result is returned instead.
      */
1589 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1590 {
1591         struct tcphdr *th = skb->h.th;
1592         struct iphdr *iph = skb->nh.iph;
1593         struct tcp_sock *tp = tcp_sk(sk);
1594         struct sock *nsk;
1595         struct request_sock **prev;
1596         /* Find possible connection requests. */
1597         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1598                                                      iph->saddr, iph->daddr);
1599         if (req)
1600                 return tcp_check_req(sk, skb, req, prev);
1601
1602         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1603                                           th->source,
1604                                           skb->nh.iph->daddr,
1605                                           ntohs(th->dest),
1606                                           tcp_v4_iif(skb));
1607
1608         if (nsk) {
1609                 if (nsk->sk_state != TCP_TIME_WAIT) {
1610                         bh_lock_sock(nsk);
1611                         return nsk;
1612                 }
1613                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1614                 return NULL;
1615         }
1616
1617 #ifdef CONFIG_SYN_COOKIES
1618         if (!th->rst && !th->syn && th->ack)
1619                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1620 #endif
1621         return sk;
1622 }
1623
1624 static int tcp_v4_checksum_init(struct sk_buff *skb)
1625 {
1626         if (skb->ip_summed == CHECKSUM_HW) {
1627                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1628                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1629                                   skb->nh.iph->daddr, skb->csum))
1630                         return 0;
1631
1632                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1633                 skb->ip_summed = CHECKSUM_NONE;
1634         }
1635         if (skb->len <= 76) {
1636                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1637                                  skb->nh.iph->daddr,
1638                                  skb_checksum(skb, 0, skb->len, 0)))
1639                         return -1;
1640                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1641         } else {
1642                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1643                                           skb->nh.iph->saddr,
1644                                           skb->nh.iph->daddr, 0);
1645         }
1646         return 0;
1647 }
1648
1649
1650 /* The socket must have it's spinlock held when we get
1651  * here.
1652  *
1653  * We have a potential double-lock case here, so even when
1654  * doing backlog processing we use the BH locking scheme.
1655  * This is because we cannot sleep with the original spinlock
1656  * held.
1657  */
1658 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1659 {
1660         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1661                 TCP_CHECK_TIMER(sk);
1662                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1663                         goto reset;
1664                 TCP_CHECK_TIMER(sk);
1665                 return 0;
1666         }
1667
1668         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1669                 goto csum_err;
1670
1671         if (sk->sk_state == TCP_LISTEN) {
1672                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1673                 if (!nsk)
1674                         goto discard;
1675
1676                 if (nsk != sk) {
1677                         if (tcp_child_process(sk, nsk, skb))
1678                                 goto reset;
1679                         return 0;
1680                 }
1681         }
1682
1683         TCP_CHECK_TIMER(sk);
1684         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1685                 goto reset;
1686         TCP_CHECK_TIMER(sk);
1687         return 0;
1688
1689 reset:
1690         tcp_v4_send_reset(skb);
1691 discard:
1692         kfree_skb(skb);
1693         /* Be careful here. If this function gets more complicated and
1694          * gcc suffers from register pressure on the x86, sk (in %ebx)
1695          * might be destroyed here. This current version compiles correctly,
1696          * but you have been warned.
1697          */
1698         return 0;
1699
1700 csum_err:
1701         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1702         goto discard;
1703 }
1704
1705 /*
1706  *      From tcp_input.c
1707  */
1708
/*
 * Entry point for incoming IPv4 TCP segments.
 *
 * Validates the TCP header and checksum, fills in the TCP control block
 * of the skb, looks up the owning socket and then either processes the
 * segment directly (tcp_v4_do_rcv), leaves it on the prequeue, or adds
 * it to the socket backlog when the socket is owned by a user context.
 * Segments for TIME_WAIT sockets are handled at do_time_wait, segments
 * with no matching socket at no_tcp_socket.
 *
 * Returns the result of tcp_v4_do_rcv() when that was called, else 0.
 */
1709 int tcp_v4_rcv(struct sk_buff *skb)
1710 {
1711         struct tcphdr *th;
1712         struct sock *sk;
1713         int ret;
1714
1715         if (skb->pkt_type != PACKET_HOST)
1716                 goto discard_it;
1717
1718         /* Count it even if it's bad */
1719         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1720
1721         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1722                 goto discard_it;
1723
1724         th = skb->h.th;
1725
             /* A data offset shorter than the fixed header is malformed. */
1726         if (th->doff < sizeof(struct tcphdr) / 4)
1727                 goto bad_packet;
1728         if (!pskb_may_pull(skb, th->doff * 4))
1729                 goto discard_it;
1730
1731         /* An explanation is required here, I think.
1732          * Packet length and doff are validated by header prediction,
1733          * provided case of th->doff==0 is eliminated.
1734          * So, we defer the checks. */
1735         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1736              tcp_v4_checksum_init(skb) < 0))
1737                 goto bad_packet;
1738
             /* NOTE(review): presumably re-read because pskb_may_pull()
              * above may have relocated the header -- confirm. */
1739         th = skb->h.th;
1740         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1741         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1742                                     skb->len - th->doff * 4);
1743         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1744         TCP_SKB_CB(skb)->when    = 0;
1745         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1746         TCP_SKB_CB(skb)->sacked  = 0;
1747
1748         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1749                              skb->nh.iph->daddr, ntohs(th->dest),
1750                              tcp_v4_iif(skb));
1751
1752         if (!sk)
1753                 goto no_tcp_socket;
1754
     /* Also re-entered from do_time_wait with sk replaced by a listener. */
1755 process:
1756         if (sk->sk_state == TCP_TIME_WAIT)
1757                 goto do_time_wait;
1758
1759         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1760                 goto discard_and_relse;
1761
1762         if (sk_filter(sk, skb, 0))
1763                 goto discard_and_relse;
1764
1765         skb->dev = NULL;
1766
1767         bh_lock_sock(sk);
1768         ret = 0;
             /* Socket not owned by a user context: try the prequeue, else
              * process now.  Otherwise defer the segment to the backlog. */
1769         if (!sock_owned_by_user(sk)) {
1770                 if (!tcp_prequeue(sk, skb))
1771                         ret = tcp_v4_do_rcv(sk, skb);
1772         } else
1773                 sk_add_backlog(sk, skb);
1774         bh_unlock_sock(sk);
1775
1776         sock_put(sk);
1777
1778         return ret;
1779
1780 no_tcp_socket:
1781         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1782                 goto discard_it;
1783
             /* bad_packet sits inside the if body so that header/checksum
              * failures detected earlier share the INERRS accounting and
              * skip the reset sent for otherwise valid segments. */
1784         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1785 bad_packet:
1786                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1787         } else {
1788                 tcp_v4_send_reset(skb);
1789         }
1790
1791 discard_it:
1792         /* Discard frame. */
1793         kfree_skb(skb);
1794         return 0;
1795
1796 discard_and_relse:
1797         sock_put(sk);
1798         goto discard_it;
1799
     /* sk is really a struct tcp_tw_bucket from here on. */
1800 do_time_wait:
1801         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1802                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1803                 goto discard_it;
1804         }
1805
1806         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1807                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1808                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1809                 goto discard_it;
1810         }
1811         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1812                                            skb, th, skb->len)) {
1813         case TCP_TW_SYN: {
1814                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1815                                                           ntohs(th->dest),
1816                                                           tcp_v4_iif(skb));
                     /* A listener exists: retire the TIME_WAIT bucket and
                      * process the segment on the listener instead. */
1817                 if (sk2) {
1818                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1819                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1820                         sk = sk2;
1821                         goto process;
1822                 }
1823                 /* Fall through to ACK */
1824         }
1825         case TCP_TW_ACK:
1826                 tcp_v4_timewait_ack(sk, skb);
1827                 break;
1828         case TCP_TW_RST:
1829                 goto no_tcp_socket;
1830         case TCP_TW_SUCCESS:;
1831         }
1832         goto discard_it;
1833 }
1834
1835 static int tcp_v4_reselect_saddr(struct sock *sk)
1836 {
1837         struct inet_sock *inet = inet_sk(sk);
1838         int err;
1839         struct rtable *rt;
1840         __u32 old_saddr = inet->saddr;
1841         __u32 new_saddr;
1842         __u32 daddr = inet->daddr;
1843
1844         if (inet->opt && inet->opt->srr)
1845                 daddr = inet->opt->faddr;
1846
1847         /* Query new route. */
1848         err = ip_route_connect(&rt, daddr, 0,
1849                                RT_CONN_FLAGS(sk),
1850                                sk->sk_bound_dev_if,
1851                                IPPROTO_TCP,
1852                                inet->sport, inet->dport, sk);
1853         if (err)
1854                 return err;
1855
1856         sk_setup_caps(sk, &rt->u.dst);
1857
1858         new_saddr = rt->rt_src;
1859
1860         if (new_saddr == old_saddr)
1861                 return 0;
1862
1863         if (sysctl_ip_dynaddr > 1) {
1864                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1865                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1866                        NIPQUAD(old_saddr),
1867                        NIPQUAD(new_saddr));
1868         }
1869
1870         inet->saddr = new_saddr;
1871         inet->rcv_saddr = new_saddr;
1872
1873         /* XXX The only one ugly spot where we need to
1874          * XXX really change the sockets identity after
1875          * XXX it has entered the hashes. -DaveM
1876          *
1877          * Besides that, it does not check for connection
1878          * uniqueness. Wait for troubles.
1879          */
1880         __sk_prot_rehash(sk);
1881         return 0;
1882 }
1883
1884 int tcp_v4_rebuild_header(struct sock *sk)
1885 {
1886         struct inet_sock *inet = inet_sk(sk);
1887         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1888         u32 daddr;
1889         int err;
1890
1891         /* Route is OK, nothing to do. */
1892         if (rt)
1893                 return 0;
1894
1895         /* Reroute. */
1896         daddr = inet->daddr;
1897         if (inet->opt && inet->opt->srr)
1898                 daddr = inet->opt->faddr;
1899
1900         {
1901                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1902                                     .nl_u = { .ip4_u =
1903                                               { .daddr = daddr,
1904                                                 .saddr = inet->saddr,
1905                                                 .tos = RT_CONN_FLAGS(sk) } },
1906                                     .proto = IPPROTO_TCP,
1907                                     .uli_u = { .ports =
1908                                                { .sport = inet->sport,
1909                                                  .dport = inet->dport } } };
1910
1911                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1912         }
1913         if (!err) {
1914                 sk_setup_caps(sk, &rt->u.dst);
1915                 return 0;
1916         }
1917
1918         /* Routing failed... */
1919         sk->sk_route_caps = 0;
1920
1921         if (!sysctl_ip_dynaddr ||
1922             sk->sk_state != TCP_SYN_SENT ||
1923             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1924             (err = tcp_v4_reselect_saddr(sk)) != 0)
1925                 sk->sk_err_soft = -err;
1926
1927         return err;
1928 }
1929
1930 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1931 {
1932         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1933         struct inet_sock *inet = inet_sk(sk);
1934
1935         sin->sin_family         = AF_INET;
1936         sin->sin_addr.s_addr    = inet->daddr;
1937         sin->sin_port           = inet->dport;
1938 }
1939
1940 /* VJ's idea. Save last timestamp seen from this destination
1941  * and hold it at least for normal timewait interval to use for duplicate
1942  * segment detection in subsequent connections, before they enter synchronized
1943  * state.
1944  */
1945
1946 int tcp_v4_remember_stamp(struct sock *sk)
1947 {
1948         struct inet_sock *inet = inet_sk(sk);
1949         struct tcp_sock *tp = tcp_sk(sk);
1950         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1951         struct inet_peer *peer = NULL;
1952         int release_it = 0;
1953
1954         if (!rt || rt->rt_dst != inet->daddr) {
1955                 peer = inet_getpeer(inet->daddr, 1);
1956                 release_it = 1;
1957         } else {
1958                 if (!rt->peer)
1959                         rt_bind_peer(rt, 1);
1960                 peer = rt->peer;
1961         }
1962
1963         if (peer) {
1964                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1965                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1966                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1967                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1968                         peer->tcp_ts = tp->rx_opt.ts_recent;
1969                 }
1970                 if (release_it)
1971                         inet_putpeer(peer);
1972                 return 1;
1973         }
1974
1975         return 0;
1976 }
1977
1978 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1979 {
1980         struct inet_peer *peer = NULL;
1981
1982         peer = inet_getpeer(tw->tw_daddr, 1);
1983
1984         if (peer) {
1985                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1986                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1987                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1988                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1989                         peer->tcp_ts = tw->tw_ts_recent;
1990                 }
1991                 inet_putpeer(peer);
1992                 return 1;
1993         }
1994
1995         return 0;
1996 }
1997
1998 struct tcp_func ipv4_specific = {
1999         .queue_xmit     =       ip_queue_xmit,
2000         .send_check     =       tcp_v4_send_check,
2001         .rebuild_header =       tcp_v4_rebuild_header,
2002         .conn_request   =       tcp_v4_conn_request,
2003         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2004         .remember_stamp =       tcp_v4_remember_stamp,
2005         .net_header_len =       sizeof(struct iphdr),
2006         .setsockopt     =       ip_setsockopt,
2007         .getsockopt     =       ip_getsockopt,
2008         .addr2sockaddr  =       v4_addr2sockaddr,
2009         .sockaddr_len   =       sizeof(struct sockaddr_in),
2010 };
2011
2012 /* NOTE: A lot of things set to zero explicitly by call to
2013  *       sk_alloc() so need not be done here.
2014  */
2015 static int tcp_v4_init_sock(struct sock *sk)
2016 {
2017         struct tcp_sock *tp = tcp_sk(sk);
2018
2019         skb_queue_head_init(&tp->out_of_order_queue);
2020         tcp_init_xmit_timers(sk);
2021         tcp_prequeue_init(tp);
2022
2023         tp->rto  = TCP_TIMEOUT_INIT;
2024         tp->mdev = TCP_TIMEOUT_INIT;
2025
2026         /* So many TCP implementations out there (incorrectly) count the
2027          * initial SYN frame in their delayed-ACK and congestion control
2028          * algorithms that we must have the following bandaid to talk
2029          * efficiently to them.  -DaveM
2030          */
2031         tp->snd_cwnd = 2;
2032
2033         /* See draft-stevens-tcpca-spec-01 for discussion of the
2034          * initialization of these values.
2035          */
2036         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2037         tp->snd_cwnd_clamp = ~0;
2038         tp->mss_cache = 536;
2039
2040         tp->reordering = sysctl_tcp_reordering;
2041         tp->ca_ops = &tcp_init_congestion_ops;
2042
2043         sk->sk_state = TCP_CLOSE;
2044
2045         sk->sk_write_space = sk_stream_write_space;
2046         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2047
2048         tp->af_specific = &ipv4_specific;
2049
2050         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2051         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2052
2053         atomic_inc(&tcp_sockets_allocated);
2054
2055         return 0;
2056 }
2057
2058 int tcp_v4_destroy_sock(struct sock *sk)
2059 {
2060         struct tcp_sock *tp = tcp_sk(sk);
2061
2062         tcp_clear_xmit_timers(sk);
2063
2064         tcp_cleanup_congestion_control(tp);
2065
2066         /* Cleanup up the write buffer. */
2067         sk_stream_writequeue_purge(sk);
2068
2069         /* Cleans up our, hopefully empty, out_of_order_queue. */
2070         __skb_queue_purge(&tp->out_of_order_queue);
2071
2072         /* Clean prequeue, it must be empty really */
2073         __skb_queue_purge(&tp->ucopy.prequeue);
2074
2075         /* Clean up a referenced TCP bind bucket. */
2076         if (tp->bind_hash)
2077                 tcp_put_port(sk);
2078
2079         /*
2080          * If sendmsg cached page exists, toss it.
2081          */
2082         if (sk->sk_sndmsg_page) {
2083                 __free_page(sk->sk_sndmsg_page);
2084                 sk->sk_sndmsg_page = NULL;
2085         }
2086
2087         atomic_dec(&tcp_sockets_allocated);
2088
2089         return 0;
2090 }
2091
2092 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2093
2094 #ifdef CONFIG_PROC_FS
2095 /* Proc filesystem TCP sock list dumping. */
2096
2097 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2098 {
2099         return hlist_empty(head) ? NULL :
2100                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2101 }
2102
2103 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2104 {
2105         return tw->tw_node.next ?
2106                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2107 }
2108
/*
 * Advance the /proc iterator over listening sockets and their pending
 * connection requests.
 *
 * @cur is the element returned last: NULL to start at the first bucket
 * of tcp_listening_hash, a struct request_sock when st->state is
 * TCP_SEQ_STATE_OPENREQ, otherwise a struct sock.  Returns the next
 * element whose family equals st->family, or NULL after the last bucket.
 *
 * While a request_sock is being returned (state OPENREQ), the
 * accept_queue.syn_wait_lock of st->syn_wait_sk stays read-locked; it is
 * dropped here once that socket's request table has been fully walked.
 *
 * The code jumps into the middle of both loops (labels get_req and
 * start_req), so the statement order must not be changed casually.
 */
2109 static void *listening_get_next(struct seq_file *seq, void *cur)
2110 {
2111         struct tcp_sock *tp;
2112         struct hlist_node *node;
2113         struct sock *sk = cur;
2114         struct tcp_iter_state* st = seq->private;
2115
2116         if (!sk) {
2117                 st->bucket = 0;
2118                 sk = sk_head(&tcp_listening_hash[0]);
2119                 goto get_sk;
2120         }
2121
2122         ++st->num;
2123
2124         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request; continue along its hash chain,
                      * then through the remaining syn_table slots. */
2125                 struct request_sock *req = cur;
2126
2127                 tp = tcp_sk(st->syn_wait_sk);
2128                 req = req->dl_next;
2129                 while (1) {
2130                         while (req) {
2131                                 if (req->rsk_ops->family == st->family) {
2132                                         cur = req;
2133                                         goto out;
2134                                 }
2135                                 req = req->dl_next;
2136                         }
2137                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2138                                 break;
2139 get_req:
2140                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2141                 }
                     /* Request table exhausted: back to walking sockets. */
2142                 sk        = sk_next(st->syn_wait_sk);
2143                 st->state = TCP_SEQ_STATE_LISTENING;
2144                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2145         } else {
                     /* cur is a listening socket that was just reported;
                      * walk its pending requests before moving on. */
2146                 tp = tcp_sk(sk);
2147                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2148                 if (reqsk_queue_len(&tp->accept_queue))
2149                         goto start_req;
2150                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2151                 sk = sk_next(sk);
2152         }
2153 get_sk:
2154         sk_for_each_from(sk, node) {
2155                 if (sk->sk_family == st->family) {
2156                         cur = sk;
2157                         goto out;
2158                 }
                     /* A socket of another family is not reported itself,
                      * but its request queue is still scanned; requests
                      * are filtered by their own family above. */
2159                 tp = tcp_sk(sk);
2160                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2161                 if (reqsk_queue_len(&tp->accept_queue)) {
2162 start_req:
2163                         st->uid         = sock_i_uid(sk);
2164                         st->syn_wait_sk = sk;
2165                         st->state       = TCP_SEQ_STATE_OPENREQ;
2166                         st->sbucket     = 0;
2167                         goto get_req;
2168                 }
2169                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2170         }
2171         if (++st->bucket < TCP_LHTABLE_SIZE) {
2172                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2173                 goto get_sk;
2174         }
2175         cur = NULL;
2176 out:
2177         return cur;
2178 }
2179
2180 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2181 {
2182         void *rc = listening_get_next(seq, NULL);
2183
2184         while (rc && *pos) {
2185                 rc = listening_get_next(seq, rc);
2186                 --*pos;
2187         }
2188         return rc;
2189 }
2190
2191 static void *established_get_first(struct seq_file *seq)
2192 {
2193         struct tcp_iter_state* st = seq->private;
2194         void *rc = NULL;
2195
2196         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2197                 struct sock *sk;
2198                 struct hlist_node *node;
2199                 struct tcp_tw_bucket *tw;
2200
2201                 /* We can reschedule _before_ having picked the target: */
2202                 cond_resched_softirq();
2203
2204                 read_lock(&tcp_ehash[st->bucket].lock);
2205                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2206                         if (sk->sk_family != st->family) {
2207                                 continue;
2208                         }
2209                         rc = sk;
2210                         goto out;
2211                 }
2212                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2213                 tw_for_each(tw, node,
2214                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2215                         if (tw->tw_family != st->family) {
2216                                 continue;
2217                         }
2218                         rc = tw;
2219                         goto out;
2220                 }
2221                 read_unlock(&tcp_ehash[st->bucket].lock);
2222                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2223         }
2224 out:
2225         return rc;
2226 }
2227
/*
 * Advance the /proc iterator over the established hash.
 *
 * @cur is the element returned last: a struct tcp_tw_bucket when
 * st->state is TCP_SEQ_STATE_TIME_WAIT, otherwise a struct sock.  Each
 * bucket's socket chain is walked first, then the TIME_WAIT chain kept
 * tcp_ehash_size entries further on.  The read lock of bucket st->bucket
 * is held whenever a non-NULL element is returned; it is released here
 * before moving to the next bucket or returning NULL.
 *
 * Returns the next element of family st->family, or NULL at the end.
 */
2228 static void *established_get_next(struct seq_file *seq, void *cur)
2229 {
2230         struct sock *sk = cur;
2231         struct tcp_tw_bucket *tw;
2232         struct hlist_node *node;
2233         struct tcp_iter_state* st = seq->private;
2234
2235         ++st->num;
2236
2237         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2238                 tw = cur;
2239                 tw = tw_next(tw);
     /* Also entered from below once a bucket's socket chain is exhausted. */
2240 get_tw:
2241                 while (tw && tw->tw_family != st->family) {
2242                         tw = tw_next(tw);
2243                 }
2244                 if (tw) {
2245                         cur = tw;
2246                         goto out;
2247                 }
                     /* TIME_WAIT chain done: this bucket is finished. */
2248                 read_unlock(&tcp_ehash[st->bucket].lock);
2249                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2250
2251                 /* We can reschedule between buckets: */
2252                 cond_resched_softirq();
2253
2254                 if (++st->bucket < tcp_ehash_size) {
2255                         read_lock(&tcp_ehash[st->bucket].lock);
2256                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2257                 } else {
2258                         cur = NULL;
2259                         goto out;
2260                 }
2261         } else
2262                 sk = sk_next(sk);
2263
2264         sk_for_each_from(sk, node) {
2265                 if (sk->sk_family == st->family)
2266                         goto found;
2267         }
2268
             /* No more sockets here: switch to this bucket's TIME_WAIT half. */
2269         st->state = TCP_SEQ_STATE_TIME_WAIT;
2270         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2271         goto get_tw;
2272 found:
2273         cur = sk;
2274 out:
2275         return cur;
2276 }
2277
2278 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2279 {
2280         void *rc = established_get_first(seq);
2281
2282         while (rc && pos) {
2283                 rc = established_get_next(seq, rc);
2284                 --pos;
2285         }
2286         return rc;
2287 }
2288
2289 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2290 {
2291         void *rc;
2292         struct tcp_iter_state* st = seq->private;
2293
2294         tcp_listen_lock();
2295         st->state = TCP_SEQ_STATE_LISTENING;
2296         rc        = listening_get_idx(seq, &pos);
2297
2298         if (!rc) {
2299                 tcp_listen_unlock();
2300                 local_bh_disable();
2301                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2302                 rc        = established_get_idx(seq, pos);
2303         }
2304
2305         return rc;
2306 }
2307
2308 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2309 {
2310         struct tcp_iter_state* st = seq->private;
2311         st->state = TCP_SEQ_STATE_LISTENING;
2312         st->num = 0;
2313         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2314 }
2315
2316 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2317 {
2318         void *rc = NULL;
2319         struct tcp_iter_state* st;
2320
2321         if (v == SEQ_START_TOKEN) {
2322                 rc = tcp_get_idx(seq, 0);
2323                 goto out;
2324         }
2325         st = seq->private;
2326
2327         switch (st->state) {
2328         case TCP_SEQ_STATE_OPENREQ:
2329         case TCP_SEQ_STATE_LISTENING:
2330                 rc = listening_get_next(seq, v);
2331                 if (!rc) {
2332                         tcp_listen_unlock();
2333                         local_bh_disable();
2334                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2335                         rc        = established_get_first(seq);
2336                 }
2337                 break;
2338         case TCP_SEQ_STATE_ESTABLISHED:
2339         case TCP_SEQ_STATE_TIME_WAIT:
2340                 rc = established_get_next(seq, v);
2341                 break;
2342         }
2343 out:
2344         ++*pos;
2345         return rc;
2346 }
2347
2348 static void tcp_seq_stop(struct seq_file *seq, void *v)
2349 {
2350         struct tcp_iter_state* st = seq->private;
2351
2352         switch (st->state) {
2353         case TCP_SEQ_STATE_OPENREQ:
2354                 if (v) {
2355                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2356                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2357                 }
2358         case TCP_SEQ_STATE_LISTENING:
2359                 if (v != SEQ_START_TOKEN)
2360                         tcp_listen_unlock();
2361                 break;
2362         case TCP_SEQ_STATE_TIME_WAIT:
2363         case TCP_SEQ_STATE_ESTABLISHED:
2364                 if (v)
2365                         read_unlock(&tcp_ehash[st->bucket].lock);
2366                 local_bh_enable();
2367                 break;
2368         }
2369 }
2370
2371 static int tcp_seq_open(struct inode *inode, struct file *file)
2372 {
2373         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2374         struct seq_file *seq;
2375         struct tcp_iter_state *s;
2376         int rc;
2377
2378         if (unlikely(afinfo == NULL))
2379                 return -EINVAL;
2380
2381         s = kmalloc(sizeof(*s), GFP_KERNEL);
2382         if (!s)
2383                 return -ENOMEM;
2384         memset(s, 0, sizeof(*s));
2385         s->family               = afinfo->family;
2386         s->seq_ops.start        = tcp_seq_start;
2387         s->seq_ops.next         = tcp_seq_next;
2388         s->seq_ops.show         = afinfo->seq_show;
2389         s->seq_ops.stop         = tcp_seq_stop;
2390
2391         rc = seq_open(file, &s->seq_ops);
2392         if (rc)
2393                 goto out_kfree;
2394         seq          = file->private_data;
2395         seq->private = s;
2396 out:
2397         return rc;
2398 out_kfree:
2399         kfree(s);
2400         goto out;
2401 }
2402
2403 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2404 {
2405         int rc = 0;
2406         struct proc_dir_entry *p;
2407
2408         if (!afinfo)
2409                 return -EINVAL;
2410         afinfo->seq_fops->owner         = afinfo->owner;
2411         afinfo->seq_fops->open          = tcp_seq_open;
2412         afinfo->seq_fops->read          = seq_read;
2413         afinfo->seq_fops->llseek        = seq_lseek;
2414         afinfo->seq_fops->release       = seq_release_private;
2415
2416         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2417         if (p)
2418                 p->data = afinfo;
2419         else
2420                 rc = -ENOMEM;
2421         return rc;
2422 }
2423
2424 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2425 {
2426         if (!afinfo)
2427                 return;
2428         proc_net_remove(afinfo->name);
2429         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2430 }
2431
2432 static void get_openreq4(struct sock *sk, struct request_sock *req,
2433                          char *tmpbuf, int i, int uid)
2434 {
2435         const struct inet_request_sock *ireq = inet_rsk(req);
2436         int ttd = req->expires - jiffies;
2437
2438         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2439                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2440                 i,
2441                 ireq->loc_addr,
2442                 ntohs(inet_sk(sk)->sport),
2443                 ireq->rmt_addr,
2444                 ntohs(ireq->rmt_port),
2445                 TCP_SYN_RECV,
2446                 0, 0, /* could print option size, but that is af dependent. */
2447                 1,    /* timers active (only the expire timer) */
2448                 jiffies_to_clock_t(ttd),
2449                 req->retrans,
2450                 uid,
2451                 0,  /* non standard timer */
2452                 0, /* open_requests have no inode */
2453                 atomic_read(&sk->sk_refcnt),
2454                 req);
2455 }
2456
2457 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2458 {
2459         int timer_active;
2460         unsigned long timer_expires;
2461         struct tcp_sock *tp = tcp_sk(sp);
2462         struct inet_sock *inet = inet_sk(sp);
2463         unsigned int dest = inet->daddr;
2464         unsigned int src = inet->rcv_saddr;
2465         __u16 destp = ntohs(inet->dport);
2466         __u16 srcp = ntohs(inet->sport);
2467
2468         if (tp->pending == TCP_TIME_RETRANS) {
2469                 timer_active    = 1;
2470                 timer_expires   = tp->timeout;
2471         } else if (tp->pending == TCP_TIME_PROBE0) {
2472                 timer_active    = 4;
2473                 timer_expires   = tp->timeout;
2474         } else if (timer_pending(&sp->sk_timer)) {
2475                 timer_active    = 2;
2476                 timer_expires   = sp->sk_timer.expires;
2477         } else {
2478                 timer_active    = 0;
2479                 timer_expires = jiffies;
2480         }
2481
2482         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2483                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2484                 i, src, srcp, dest, destp, sp->sk_state,
2485                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2486                 timer_active,
2487                 jiffies_to_clock_t(timer_expires - jiffies),
2488                 tp->retransmits,
2489                 sock_i_uid(sp),
2490                 tp->probes_out,
2491                 sock_i_ino(sp),
2492                 atomic_read(&sp->sk_refcnt), sp,
2493                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2494                 tp->snd_cwnd,
2495                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2496 }
2497
2498 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2499 {
2500         unsigned int dest, src;
2501         __u16 destp, srcp;
2502         int ttd = tw->tw_ttd - jiffies;
2503
2504         if (ttd < 0)
2505                 ttd = 0;
2506
2507         dest  = tw->tw_daddr;
2508         src   = tw->tw_rcv_saddr;
2509         destp = ntohs(tw->tw_dport);
2510         srcp  = ntohs(tw->tw_sport);
2511
2512         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2513                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2514                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2515                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2516                 atomic_read(&tw->tw_refcnt), tw);
2517 }
2518
/*
 * Size basis for one formatted record: tcp4_seq_show() formats each entry
 * into a scratch buffer of TMPSZ + 1 bytes and prints it left-justified,
 * padded to TMPSZ - 1 columns, followed by a newline.
 */
#define TMPSZ 150
2520
2521 static int tcp4_seq_show(struct seq_file *seq, void *v)
2522 {
2523         struct tcp_iter_state* st;
2524         char tmpbuf[TMPSZ + 1];
2525
2526         if (v == SEQ_START_TOKEN) {
2527                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2528                            "  sl  local_address rem_address   st tx_queue "
2529                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2530                            "inode");
2531                 goto out;
2532         }
2533         st = seq->private;
2534
2535         switch (st->state) {
2536         case TCP_SEQ_STATE_LISTENING:
2537         case TCP_SEQ_STATE_ESTABLISHED:
2538                 get_tcp4_sock(v, tmpbuf, st->num);
2539                 break;
2540         case TCP_SEQ_STATE_OPENREQ:
2541                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2542                 break;
2543         case TCP_SEQ_STATE_TIME_WAIT:
2544                 get_timewait4_sock(v, tmpbuf, st->num);
2545                 break;
2546         }
2547         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2548 out:
2549         return 0;
2550 }
2551
2552 static struct file_operations tcp4_seq_fops;
2553 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2554         .owner          = THIS_MODULE,
2555         .name           = "tcp",
2556         .family         = AF_INET,
2557         .seq_show       = tcp4_seq_show,
2558         .seq_fops       = &tcp4_seq_fops,
2559 };
2560
/*
 * Register the IPv4 TCP table description (tcp4_seq_afinfo) through
 * tcp_proc_register() and hand its return value back to the caller.
 */
int __init tcp4_proc_init(void)
{
        return tcp_proc_register(&tcp4_seq_afinfo);
}
2565
/*
 * Counterpart of tcp4_proc_init(): pass tcp4_seq_afinfo to
 * tcp_proc_unregister().
 */
void tcp4_proc_exit(void)
{
        tcp_proc_unregister(&tcp4_seq_afinfo);
}
2570 #endif /* CONFIG_PROC_FS */
2571
2572 struct proto tcp_prot = {
2573         .name                   = "TCP",
2574         .owner                  = THIS_MODULE,
2575         .close                  = tcp_close,
2576         .connect                = tcp_v4_connect,
2577         .disconnect             = tcp_disconnect,
2578         .accept                 = tcp_accept,
2579         .ioctl                  = tcp_ioctl,
2580         .init                   = tcp_v4_init_sock,
2581         .destroy                = tcp_v4_destroy_sock,
2582         .shutdown               = tcp_shutdown,
2583         .setsockopt             = tcp_setsockopt,
2584         .getsockopt             = tcp_getsockopt,
2585         .sendmsg                = tcp_sendmsg,
2586         .recvmsg                = tcp_recvmsg,
2587         .backlog_rcv            = tcp_v4_do_rcv,
2588         .hash                   = tcp_v4_hash,
2589         .unhash                 = tcp_unhash,
2590         .get_port               = tcp_v4_get_port,
2591         .enter_memory_pressure  = tcp_enter_memory_pressure,
2592         .sockets_allocated      = &tcp_sockets_allocated,
2593         .memory_allocated       = &tcp_memory_allocated,
2594         .memory_pressure        = &tcp_memory_pressure,
2595         .sysctl_mem             = sysctl_tcp_mem,
2596         .sysctl_wmem            = sysctl_tcp_wmem,
2597         .sysctl_rmem            = sysctl_tcp_rmem,
2598         .max_header             = MAX_TCP_HEADER,
2599         .obj_size               = sizeof(struct tcp_sock),
2600         .rsk_prot               = &tcp_request_sock_ops,
2601 };
2602
2603
2604
2605 void __init tcp_v4_init(struct net_proto_family *ops)
2606 {
2607         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2608         if (err < 0)
2609                 panic("Failed to create the TCP control socket.\n");
2610         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2611         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2612
2613         /* Unhash it so that IP input processing does not even
2614          * see it, we do not wish this socket to see incoming
2615          * packets.
2616          */
2617         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2618 }
2619
/* Symbols from this file made available through EXPORT_SYMBOL(). */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* The proc registration helpers are exported only with CONFIG_PROC_FS. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
/* Exported sysctl tunables. */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2645