SAFE public projects git trees. - safe/jmp/linux-2.6/blob - net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/ipv6.h>
  70 #include <net/inet_common.h>
  71 #include <net/xfrm.h>
  72
  73 #include <linux/inet.h>
  74 #include <linux/ipv6.h>
  75 #include <linux/stddef.h>
  76 #include <linux/proc_fs.h>
  77 #include <linux/seq_file.h>
  78
/* Defined outside this file; only declared here. */
extern int sysctl_ip_dynaddr;
/*
 * When non-zero, __tcp_v4_check_established() may take over a matching
 * TIME-WAIT socket for a new outgoing connection (subject to its
 * timestamp checks).
 */
int sysctl_tcp_tw_reuse;
/* NOTE(review): not referenced in the part of the file reviewed here. */
int sysctl_tcp_low_latency;

/*
 * Check TCP sequence numbers in ICMP packets.
 * Same value as the 8 TCP header bytes tcp_v4_err() requires after the
 * quoted IP header.
 */
#define ICMP_MIN_LENGTH 8

/*
 * Socket used for sending RSTs; tcp_v4_send_ack() also passes it to
 * ip_send_reply() for ACKs sent outside of any socket context.
 */
static struct socket *tcp_socket;

/* Forward declaration; the definition appears further down in this file. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb);
  91
/*
 * TCP's instance of the inet hash tables.  Its address is handed to the
 * inet_*() hashing helpers used throughout this file.  Only the locks, the
 * listening-hash user count and wait queue, and the port rover receive
 * static initializers here; the ehash/bhash members read elsewhere in this
 * file are not set up at this point (presumably done at init time --
 * confirm).
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock     = RW_LOCK_UNLOCKED,
        .lhash_users    = ATOMIC_INIT(0),
        .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
        .portalloc_lock = SPIN_LOCK_UNLOCKED,
        /* NOTE(review): looks like "one below the first unprivileged port" -- confirm */
        .port_rover     = 1024 - 1,
};
  99
 100 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 101 {
 102         return inet_csk_get_port(&tcp_hashinfo, sk, snum);
 103 }
 104
 105 static void tcp_v4_hash(struct sock *sk)
 106 {
 107         inet_hash(&tcp_hashinfo, sk);
 108 }
 109
 110 void tcp_unhash(struct sock *sk)
 111 {
 112         inet_unhash(&tcp_hashinfo, sk);
 113 }
 114
 115 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 116 {
 117         return secure_tcp_sequence_number(skb->nh.iph->daddr,
 118                                           skb->nh.iph->saddr,
 119                                           skb->h.th->dest,
 120                                           skb->h.th->source);
 121 }
 122
 123 /* called with local bh disabled */
 124 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 125                                       struct inet_timewait_sock **twp)
 126 {
 127         struct inet_sock *inet = inet_sk(sk);
 128         u32 daddr = inet->rcv_saddr;
 129         u32 saddr = inet->daddr;
 130         int dif = sk->sk_bound_dev_if;
 131         INET_ADDR_COOKIE(acookie, saddr, daddr)
 132         const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
 133         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
 134         struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
 135         struct sock *sk2;
 136         const struct hlist_node *node;
 137         struct inet_timewait_sock *tw;
 138
 139         write_lock(&head->lock);
 140
 141         /* Check TIME-WAIT sockets first. */
 142         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
 143                 tw = inet_twsk(sk2);
 144
 145                 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 146                         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 147                         struct tcp_sock *tp = tcp_sk(sk);
 148
 149                         /* With PAWS, it is safe from the viewpoint
 150                            of data integrity. Even without PAWS it
 151                            is safe provided sequence spaces do not
 152                            overlap i.e. at data rates <= 80Mbit/sec.
 153
 154                            Actually, the idea is close to VJ's one,
 155                            only timestamp cache is held not per host,
 156                            but per port pair and TW bucket is used
 157                            as state holder.
 158
 159                            If TW bucket has been already destroyed we
 160                            fall back to VJ's scheme and use initial
 161                            timestamp retrieved from peer table.
 162                          */
 163                         if (tcptw->tw_ts_recent_stamp &&
 164                             (!twp || (sysctl_tcp_tw_reuse &&
 165                                       xtime.tv_sec -
 166                                       tcptw->tw_ts_recent_stamp > 1))) {
 167                                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 168                                 if (tp->write_seq == 0)
 169                                         tp->write_seq = 1;
 170                                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 171                                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 172                                 sock_hold(sk2);
 173                                 goto unique;
 174                         } else
 175                                 goto not_unique;
 176                 }
 177         }
 178         tw = NULL;
 179
 180         /* And established part... */
 181         sk_for_each(sk2, node, &head->chain) {
 182                 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 183                         goto not_unique;
 184         }
 185
 186 unique:
 187         /* Must record num and sport now. Otherwise we will see
 188          * in hash table socket with a funny identity. */
 189         inet->num = lport;
 190         inet->sport = htons(lport);
 191         sk->sk_hashent = hash;
 192         BUG_TRAP(sk_unhashed(sk));
 193         __sk_add_node(sk, &head->chain);
 194         sock_prot_inc_use(sk->sk_prot);
 195         write_unlock(&head->lock);
 196
 197         if (twp) {
 198                 *twp = tw;
 199                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 200         } else if (tw) {
 201                 /* Silly. Should hash-dance instead... */
 202                 inet_twsk_deschedule(tw, &tcp_death_row);
 203                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 204
 205                 inet_twsk_put(tw);
 206         }
 207
 208         return 0;
 209
 210 not_unique:
 211         write_unlock(&head->lock);
 212         return -EADDRNOTAVAIL;
 213 }
 214
 215 static inline u32 connect_port_offset(const struct sock *sk)
 216 {
 217         const struct inet_sock *inet = inet_sk(sk);
 218
 219         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 220                                          inet->dport);
 221 }
 222
 223 /*
 224  * Bind a port for a connect operation and hash it.
 225  */
 226 static inline int tcp_v4_hash_connect(struct sock *sk)
 227 {
 228         const unsigned short snum = inet_sk(sk)->num;
 229         struct inet_bind_hashbucket *head;
 230         struct inet_bind_bucket *tb;
 231         int ret;
 232
 233         if (!snum) {
 234                 int low = sysctl_local_port_range[0];
 235                 int high = sysctl_local_port_range[1];
 236                 int range = high - low;
 237                 int i;
 238                 int port;
 239                 static u32 hint;
 240                 u32 offset = hint + connect_port_offset(sk);
 241                 struct hlist_node *node;
 242                 struct inet_timewait_sock *tw = NULL;
 243
 244                 local_bh_disable();
 245                 for (i = 1; i <= range; i++) {
 246                         port = low + (i + offset) % range;
 247                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
 248                         spin_lock(&head->lock);
 249
 250                         /* Does not bother with rcv_saddr checks,
 251                          * because the established check is already
 252                          * unique enough.
 253                          */
 254                         inet_bind_bucket_for_each(tb, node, &head->chain) {
 255                                 if (tb->port == port) {
 256                                         BUG_TRAP(!hlist_empty(&tb->owners));
 257                                         if (tb->fastreuse >= 0)
 258                                                 goto next_port;
 259                                         if (!__tcp_v4_check_established(sk,
 260                                                                         port,
 261                                                                         &tw))
 262                                                 goto ok;
 263                                         goto next_port;
 264                                 }
 265                         }
 266
 267                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
 268                         if (!tb) {
 269                                 spin_unlock(&head->lock);
 270                                 break;
 271                         }
 272                         tb->fastreuse = -1;
 273                         goto ok;
 274
 275                 next_port:
 276                         spin_unlock(&head->lock);
 277                 }
 278                 local_bh_enable();
 279
 280                 return -EADDRNOTAVAIL;
 281
 282 ok:
 283                 hint += i;
 284
 285                 /* Head lock still held and bh's disabled */
 286                 inet_bind_hash(sk, tb, port);
 287                 if (sk_unhashed(sk)) {
 288                         inet_sk(sk)->sport = htons(port);
 289                         __inet_hash(&tcp_hashinfo, sk, 0);
 290                 }
 291                 spin_unlock(&head->lock);
 292
 293                 if (tw) {
 294                         inet_twsk_deschedule(tw, &tcp_death_row);;
 295                         inet_twsk_put(tw);
 296                 }
 297
 298                 ret = 0;
 299                 goto out;
 300         }
 301
 302         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
 303         tb  = inet_csk(sk)->icsk_bind_hash;
 304         spin_lock_bh(&head->lock);
 305         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 306                 __inet_hash(&tcp_hashinfo, sk, 0);
 307                 spin_unlock_bh(&head->lock);
 308                 return 0;
 309         } else {
 310                 spin_unlock(&head->lock);
 311                 /* No definite answer... Walk to established hash table */
 312                 ret = __tcp_v4_check_established(sk, snum, NULL);
 313 out:
 314                 local_bh_enable();
 315                 return ret;
 316         }
 317 }
 318
 319 /* This will initiate an outgoing connection. */
 320 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 321 {
 322         struct inet_sock *inet = inet_sk(sk);
 323         struct tcp_sock *tp = tcp_sk(sk);
 324         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 325         struct rtable *rt;
 326         u32 daddr, nexthop;
 327         int tmp;
 328         int err;
 329
 330         if (addr_len < sizeof(struct sockaddr_in))
 331                 return -EINVAL;
 332
 333         if (usin->sin_family != AF_INET)
 334                 return -EAFNOSUPPORT;
 335
 336         nexthop = daddr = usin->sin_addr.s_addr;
 337         if (inet->opt && inet->opt->srr) {
 338                 if (!daddr)
 339                         return -EINVAL;
 340                 nexthop = inet->opt->faddr;
 341         }
 342
 343         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 344                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 345                                IPPROTO_TCP,
 346                                inet->sport, usin->sin_port, sk);
 347         if (tmp < 0)
 348                 return tmp;
 349
 350         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 351                 ip_rt_put(rt);
 352                 return -ENETUNREACH;
 353         }
 354
 355         if (!inet->opt || !inet->opt->srr)
 356                 daddr = rt->rt_dst;
 357
 358         if (!inet->saddr)
 359                 inet->saddr = rt->rt_src;
 360         inet->rcv_saddr = inet->saddr;
 361
 362         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 363                 /* Reset inherited state */
 364                 tp->rx_opt.ts_recent       = 0;
 365                 tp->rx_opt.ts_recent_stamp = 0;
 366                 tp->write_seq              = 0;
 367         }
 368
 369         if (tcp_death_row.sysctl_tw_recycle &&
 370             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 371                 struct inet_peer *peer = rt_get_peer(rt);
 372
 373                 /* VJ's idea. We save last timestamp seen from
 374                  * the destination in peer table, when entering state TIME-WAIT
 375                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 376                  */
 377
 378                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 379                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 380                         tp->rx_opt.ts_recent = peer->tcp_ts;
 381                 }
 382         }
 383
 384         inet->dport = usin->sin_port;
 385         inet->daddr = daddr;
 386
 387         tp->ext_header_len = 0;
 388         if (inet->opt)
 389                 tp->ext_header_len = inet->opt->optlen;
 390
 391         tp->rx_opt.mss_clamp = 536;
 392
 393         /* Socket identity is still unknown (sport may be zero).
 394          * However we set state to SYN-SENT and not releasing socket
 395          * lock select source port, enter ourselves into the hash tables and
 396          * complete initialization after this.
 397          */
 398         tcp_set_state(sk, TCP_SYN_SENT);
 399         err = tcp_v4_hash_connect(sk);
 400         if (err)
 401                 goto failure;
 402
 403         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 404         if (err)
 405                 goto failure;
 406
 407         /* OK, now commit destination to socket.  */
 408         sk_setup_caps(sk, &rt->u.dst);
 409
 410         if (!tp->write_seq)
 411                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 412                                                            inet->daddr,
 413                                                            inet->sport,
 414                                                            usin->sin_port);
 415
 416         inet->id = tp->write_seq ^ jiffies;
 417
 418         err = tcp_connect(sk);
 419         rt = NULL;
 420         if (err)
 421                 goto failure;
 422
 423         return 0;
 424
 425 failure:
 426         /* This unhashes the socket and releases the local port, if necessary. */
 427         tcp_set_state(sk, TCP_CLOSE);
 428         ip_rt_put(rt);
 429         sk->sk_route_caps = 0;
 430         inet->dport = 0;
 431         return err;
 432 }
 433
 434 /*
 435  * This routine does path mtu discovery as defined in RFC1191.
 436  */
 437 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 438                                      u32 mtu)
 439 {
 440         struct dst_entry *dst;
 441         struct inet_sock *inet = inet_sk(sk);
 442         struct tcp_sock *tp = tcp_sk(sk);
 443
 444         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 445          * send out by Linux are always <576bytes so they should go through
 446          * unfragmented).
 447          */
 448         if (sk->sk_state == TCP_LISTEN)
 449                 return;
 450
 451         /* We don't check in the destentry if pmtu discovery is forbidden
 452          * on this route. We just assume that no packet_to_big packets
 453          * are send back when pmtu discovery is not active.
 454          * There is a small race when the user changes this flag in the
 455          * route, but I think that's acceptable.
 456          */
 457         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 458                 return;
 459
 460         dst->ops->update_pmtu(dst, mtu);
 461
 462         /* Something is about to be wrong... Remember soft error
 463          * for the case, if this connection will not able to recover.
 464          */
 465         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 466                 sk->sk_err_soft = EMSGSIZE;
 467
 468         mtu = dst_mtu(dst);
 469
 470         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 471             tp->pmtu_cookie > mtu) {
 472                 tcp_sync_mss(sk, mtu);
 473
 474                 /* Resend the TCP packet because it's
 475                  * clear that the old packet has been
 476                  * dropped. This is the new "fast" path mtu
 477                  * discovery.
 478                  */
 479                 tcp_simple_retransmit(sk);
 480         } /* else let the usual retransmit timer handle it */
 481 }
 482
 483 /*
 484  * This routine is called by the ICMP module when it gets some
 485  * sort of error condition.  If err < 0 then the socket should
 486  * be closed and the error returned to the user.  If err > 0
 487  * it's just the icmp type << 8 | icmp code.  After adjustment
 488  * header points to the first 8 bytes of the tcp header.  We need
 489  * to find the appropriate port.
 490  *
 491  * The locking strategy used here is very "optimistic". When
 492  * someone else accesses the socket the ICMP is just dropped
 493  * and for some paths there is no check at all.
 494  * A more general error queue to queue errors for later handling
 495  * is probably better.
 496  *
 497  */
 498
 499 void tcp_v4_err(struct sk_buff *skb, u32 info)
 500 {
 501         struct iphdr *iph = (struct iphdr *)skb->data;
 502         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 503         struct tcp_sock *tp;
 504         struct inet_sock *inet;
 505         int type = skb->h.icmph->type;
 506         int code = skb->h.icmph->code;
 507         struct sock *sk;
 508         __u32 seq;
 509         int err;
 510
 511         if (skb->len < (iph->ihl << 2) + 8) {
 512                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 513                 return;
 514         }
 515
 516         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
 517                          th->source, inet_iif(skb));
 518         if (!sk) {
 519                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 520                 return;
 521         }
 522         if (sk->sk_state == TCP_TIME_WAIT) {
 523                 inet_twsk_put((struct inet_timewait_sock *)sk);
 524                 return;
 525         }
 526
 527         bh_lock_sock(sk);
 528         /* If too many ICMPs get dropped on busy
 529          * servers this needs to be solved differently.
 530          */
 531         if (sock_owned_by_user(sk))
 532                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
 533
 534         if (sk->sk_state == TCP_CLOSE)
 535                 goto out;
 536
 537         tp = tcp_sk(sk);
 538         seq = ntohl(th->seq);
 539         if (sk->sk_state != TCP_LISTEN &&
 540             !between(seq, tp->snd_una, tp->snd_nxt)) {
 541                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
 542                 goto out;
 543         }
 544
 545         switch (type) {
 546         case ICMP_SOURCE_QUENCH:
 547                 /* Just silently ignore these. */
 548                 goto out;
 549         case ICMP_PARAMETERPROB:
 550                 err = EPROTO;
 551                 break;
 552         case ICMP_DEST_UNREACH:
 553                 if (code > NR_ICMP_UNREACH)
 554                         goto out;
 555
 556                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 557                         if (!sock_owned_by_user(sk))
 558                                 do_pmtu_discovery(sk, iph, info);
 559                         goto out;
 560                 }
 561
 562                 err = icmp_err_convert[code].errno;
 563                 break;
 564         case ICMP_TIME_EXCEEDED:
 565                 err = EHOSTUNREACH;
 566                 break;
 567         default:
 568                 goto out;
 569         }
 570
 571         switch (sk->sk_state) {
 572                 struct request_sock *req, **prev;
 573         case TCP_LISTEN:
 574                 if (sock_owned_by_user(sk))
 575                         goto out;
 576
 577                 req = inet_csk_search_req(sk, &prev, th->dest,
 578                                           iph->daddr, iph->saddr);
 579                 if (!req)
 580                         goto out;
 581
 582                 /* ICMPs are not backlogged, hence we cannot get
 583                    an established socket here.
 584                  */
 585                 BUG_TRAP(!req->sk);
 586
 587                 if (seq != tcp_rsk(req)->snt_isn) {
 588                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 589                         goto out;
 590                 }
 591
 592                 /*
 593                  * Still in SYN_RECV, just remove it silently.
 594                  * There is no good way to pass the error to the newly
 595                  * created socket, and POSIX does not want network
 596                  * errors returned from accept().
 597                  */
 598                 inet_csk_reqsk_queue_drop(sk, req, prev);
 599                 goto out;
 600
 601         case TCP_SYN_SENT:
 602         case TCP_SYN_RECV:  /* Cannot happen.
 603                                It can f.e. if SYNs crossed.
 604                              */
 605                 if (!sock_owned_by_user(sk)) {
 606                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 607                         sk->sk_err = err;
 608
 609                         sk->sk_error_report(sk);
 610
 611                         tcp_done(sk);
 612                 } else {
 613                         sk->sk_err_soft = err;
 614                 }
 615                 goto out;
 616         }
 617
 618         /* If we've already connected we will keep trying
 619          * until we time out, or the user gives up.
 620          *
 621          * rfc1122 4.2.3.9 allows to consider as hard errors
 622          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 623          * but it is obsoleted by pmtu discovery).
 624          *
 625          * Note, that in modern internet, where routing is unreliable
 626          * and in each dark corner broken firewalls sit, sending random
 627          * errors ordered by their masters even this two messages finally lose
 628          * their original sense (even Linux sends invalid PORT_UNREACHs)
 629          *
 630          * Now we are in compliance with RFCs.
 631          *                                                      --ANK (980905)
 632          */
 633
 634         inet = inet_sk(sk);
 635         if (!sock_owned_by_user(sk) && inet->recverr) {
 636                 sk->sk_err = err;
 637                 sk->sk_error_report(sk);
 638         } else  { /* Only an error on timeout */
 639                 sk->sk_err_soft = err;
 640         }
 641
 642 out:
 643         bh_unlock_sock(sk);
 644         sock_put(sk);
 645 }
 646
 647 /* This routine computes an IPv4 TCP checksum. */
 648 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
 649                        struct sk_buff *skb)
 650 {
 651         struct inet_sock *inet = inet_sk(sk);
 652
 653         if (skb->ip_summed == CHECKSUM_HW) {
 654                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
 655                 skb->csum = offsetof(struct tcphdr, check);
 656         } else {
 657                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
 658                                          csum_partial((char *)th,
 659                                                       th->doff << 2,
 660                                                       skb->csum));
 661         }
 662 }
 663
 664 /*
 665  *      This routine will send an RST to the other tcp.
 666  *
 667  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 668  *                    for reset.
 669  *      Answer: if a packet caused RST, it is not for a socket
 670  *              existing in our system, if it is matched to a socket,
 671  *              it is just duplicate segment or bug in other side's TCP.
 672  *              So that we build reply only basing on parameters
 673  *              arrived with segment.
 674  *      Exception: precedence violation. We do not implement it in any case.
 675  */
 676
 677 static void tcp_v4_send_reset(struct sk_buff *skb)
 678 {
 679         struct tcphdr *th = skb->h.th;
 680         struct tcphdr rth;
 681         struct ip_reply_arg arg;
 682
 683         /* Never send a reset in response to a reset. */
 684         if (th->rst)
 685                 return;
 686
 687         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
 688                 return;
 689
 690         /* Swap the send and the receive. */
 691         memset(&rth, 0, sizeof(struct tcphdr));
 692         rth.dest   = th->source;
 693         rth.source = th->dest;
 694         rth.doff   = sizeof(struct tcphdr) / 4;
 695         rth.rst    = 1;
 696
 697         if (th->ack) {
 698                 rth.seq = th->ack_seq;
 699         } else {
 700                 rth.ack = 1;
 701                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 702                                     skb->len - (th->doff << 2));
 703         }
 704
 705         memset(&arg, 0, sizeof arg);
 706         arg.iov[0].iov_base = (unsigned char *)&rth;
 707         arg.iov[0].iov_len  = sizeof rth;
 708         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
 709                                       skb->nh.iph->saddr, /*XXX*/
 710                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
 711         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 712
 713         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
 714
 715         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 716         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
 717 }
 718
 719 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 720    outside socket context is ugly, certainly. What can I do?
 721  */
 722
 723 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 724                             u32 win, u32 ts)
 725 {
 726         struct tcphdr *th = skb->h.th;
 727         struct {
 728                 struct tcphdr th;
 729                 u32 tsopt[3];
 730         } rep;
 731         struct ip_reply_arg arg;
 732
 733         memset(&rep.th, 0, sizeof(struct tcphdr));
 734         memset(&arg, 0, sizeof arg);
 735
 736         arg.iov[0].iov_base = (unsigned char *)&rep;
 737         arg.iov[0].iov_len  = sizeof(rep.th);
 738         if (ts) {
 739                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 740                                      (TCPOPT_TIMESTAMP << 8) |
 741                                      TCPOLEN_TIMESTAMP);
 742                 rep.tsopt[1] = htonl(tcp_time_stamp);
 743                 rep.tsopt[2] = htonl(ts);
 744                 arg.iov[0].iov_len = sizeof(rep);
 745         }
 746
 747         /* Swap the send and the receive. */
 748         rep.th.dest    = th->source;
 749         rep.th.source  = th->dest;
 750         rep.th.doff    = arg.iov[0].iov_len / 4;
 751         rep.th.seq     = htonl(seq);
 752         rep.th.ack_seq = htonl(ack);
 753         rep.th.ack     = 1;
 754         rep.th.window  = htons(win);
 755
 756         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
 757                                       skb->nh.iph->saddr, /*XXX*/
 758                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 759         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 760
 761         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 762
 763         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 764 }
 765
 766 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 767 {
 768         struct inet_timewait_sock *tw = inet_twsk(sk);
 769         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 770
 771         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 772                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
 773
 774         inet_twsk_put(tw);
 775 }
 776
 777 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 778 {
 779         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 780                         req->ts_recent);
 781 }
 782
 783 /*
 784  *      Send a SYN-ACK after having received an ACK.
 785  *      This still operates on a request_sock only, not on a big
 786  *      socket.
 787  */
 788 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 789                               struct dst_entry *dst)
 790 {
 791         const struct inet_request_sock *ireq = inet_rsk(req);
 792         int err = -1;
 793         struct sk_buff * skb;
 794
 795         /* First, grab a route. */
 796         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 797                 goto out;
 798
 799         skb = tcp_make_synack(sk, dst, req);
 800
 801         if (skb) {
 802                 struct tcphdr *th = skb->h.th;
 803
 804                 th->check = tcp_v4_check(th, skb->len,
 805                                          ireq->loc_addr,
 806                                          ireq->rmt_addr,
 807                                          csum_partial((char *)th, skb->len,
 808                                                       skb->csum));
 809
 810                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 811                                             ireq->rmt_addr,
 812                                             ireq->opt);
 813                 if (err == NET_XMIT_CN)
 814                         err = 0;
 815         }
 816
 817 out:
 818         dst_release(dst);
 819         return err;
 820 }
 821
 822 /*
 823  *      IPv4 request_sock destructor.
 824  */
 825 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 826 {
 827         if (inet_rsk(req)->opt)
 828                 kfree(inet_rsk(req)->opt);
 829 }
 830
 831 static inline void syn_flood_warning(struct sk_buff *skb)
 832 {
 833         static unsigned long warntime;
 834
 835         if (time_after(jiffies, (warntime + HZ * 60))) {
 836                 warntime = jiffies;
 837                 printk(KERN_INFO
 838                        "possible SYN flooding on port %d. Sending cookies.\n",
 839                        ntohs(skb->h.th->dest));
 840         }
 841 }
 842
 843 /*
 844  * Save and compile IPv4 options into the request_sock if needed.
 845  */
 846 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
 847                                                      struct sk_buff *skb)
 848 {
 849         struct ip_options *opt = &(IPCB(skb)->opt);
 850         struct ip_options *dopt = NULL;
 851
 852         if (opt && opt->optlen) {
 853                 int opt_size = optlength(opt);
 854                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 855                 if (dopt) {
 856                         if (ip_options_echo(dopt, skb)) {
 857                                 kfree(dopt);
 858                                 dopt = NULL;
 859                         }
 860                 }
 861         }
 862         return dopt;
 863 }
 864
 865 struct request_sock_ops tcp_request_sock_ops = {
 866         .family         =       PF_INET,
 867         .obj_size       =       sizeof(struct tcp_request_sock),
 868         .rtx_syn_ack    =       tcp_v4_send_synack,
 869         .send_ack       =       tcp_v4_reqsk_send_ack,
 870         .destructor     =       tcp_v4_reqsk_destructor,
 871         .send_reset     =       tcp_v4_send_reset,
 872 };
 873
     /*
      * Handle an incoming connection request (@skb) on the listening
      * socket @sk.
      *
      * A request_sock is allocated and filled in from the parsed TCP
      * options and the addresses of the packet, an initial sequence number
      * is chosen (syncookie, timewait-supplied value, or
      * tcp_v4_init_sequence()), and a SYN-ACK is sent through
      * tcp_v4_send_synack().  On success the request is hashed into the
      * listener's request queue with TCP_TIMEOUT_INIT, or freed right away
      * when a syncookie was used.
      *
      * Every path returns 0; dropped requests are counted in
      * TCP_MIB_ATTEMPTFAILS.
      */
 874 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 875 {
 876         struct inet_request_sock *ireq;
 877         struct tcp_options_received tmp_opt;
 878         struct request_sock *req;
 879         __u32 saddr = skb->nh.iph->saddr;
 880         __u32 daddr = skb->nh.iph->daddr;
 881         __u32 isn = TCP_SKB_CB(skb)->when;
 882         struct dst_entry *dst = NULL;
 883 #ifdef CONFIG_SYN_COOKIES
 884         int want_cookie = 0;
 885 #else
 886 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
 887 #endif
 888
 889         /* Never answer SYNs sent to broadcast or multicast addresses. */
 890         if (((struct rtable *)skb->dst)->rt_flags &
 891             (RTCF_BROADCAST | RTCF_MULTICAST))
 892                 goto drop;
 893
 894         /* TW buckets are converted to open requests without
 895          * limitations, they conserve resources and peer is
 896          * evidently real one.
 897          */
 898         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
 899 #ifdef CONFIG_SYN_COOKIES
 900                 if (sysctl_tcp_syncookies) {
 901                         want_cookie = 1;
 902                 } else
 903 #endif
 904                 goto drop;
 905         }
 906
 907         /* Accept backlog is full. If we have already queued enough
 908          * of warm entries in syn queue, drop request. It is better than
 909          * clogging syn queue with openreqs with exponentially increasing
 910          * timeout.
 911          */
 912         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 913                 goto drop;
 914
 915         req = reqsk_alloc(&tcp_request_sock_ops);
 916         if (!req)
 917                 goto drop;
 918
 919         tcp_clear_options(&tmp_opt);
 920         tmp_opt.mss_clamp = 536;
 921         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
 922
 923         tcp_parse_options(skb, &tmp_opt, 0);
 924
     /* When answering with a cookie the parsed options are discarded. */
 925         if (want_cookie) {
 926                 tcp_clear_options(&tmp_opt);
 927                 tmp_opt.saw_tstamp = 0;
 928         }
 929
 930         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
 931                 /* Some OSes (unknown ones, but I see them on web server, which
 932                  * contains information interesting only for windows'
 933                  * users) do not send their stamp in SYN. It is easy case.
 934                  * We simply do not advertise TS support.
 935                  */
 936                 tmp_opt.saw_tstamp = 0;
 937                 tmp_opt.tstamp_ok  = 0;
 938         }
 939         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 940
 941         tcp_openreq_init(req, &tmp_opt, skb);
 942
 943         ireq = inet_rsk(req);
 944         ireq->loc_addr = daddr;
 945         ireq->rmt_addr = saddr;
 946         ireq->opt = tcp_v4_save_options(sk, skb);
 947         if (!want_cookie)
 948                 TCP_ECN_create_request(req, skb->h.th);
 949
 950         if (want_cookie) {
 951 #ifdef CONFIG_SYN_COOKIES
 952                 syn_flood_warning(skb);
 953 #endif
 954                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
 955         } else if (!isn) {
 956                 struct inet_peer *peer = NULL;
 957
 958                 /* VJ's idea. We save last timestamp seen
 959                  * from the destination in peer table, when entering
 960                  * state TIME-WAIT, and check against it before
 961                  * accepting new connection request.
 962                  *
 963                  * If "isn" is not zero, this request hit alive
 964                  * timewait bucket, so that all the necessary checks
 965                  * are made in the function processing timewait state.
 966                  */
 967                 if (tmp_opt.saw_tstamp &&
 968                     tcp_death_row.sysctl_tw_recycle &&
 969                     (dst = inet_csk_route_req(sk, req)) != NULL &&
 970                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 971                     peer->v4daddr == saddr) {
 972                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
 973                             (s32)(peer->tcp_ts - req->ts_recent) >
 974                                                         TCP_PAWS_WINDOW) {
 975                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
 976                                 dst_release(dst);
 977                                 goto drop_and_free;
 978                         }
 979                 }
 980                 /* Kill the following clause, if you dislike this way. */
 981                 else if (!sysctl_tcp_syncookies &&
 982                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 983                           (sysctl_max_syn_backlog >> 2)) &&
 984                          (!peer || !peer->tcp_ts_stamp) &&
 985                          (!dst || !dst_metric(dst, RTAX_RTT))) {
 986                         /* Without syncookies last quarter of
 987                          * backlog is filled with destinations,
 988                          * proven to be alive.
 989                          * It means that we continue to communicate
 990                          * to destinations, already remembered
 991                          * to the moment of synflood.
 992                          */
 993                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
 994                                               "request from %u.%u."
 995                                               "%u.%u/%u\n",
 996                                               NIPQUAD(saddr),
 997                                               ntohs(skb->h.th->source)));
 998                         dst_release(dst);
 999                         goto drop_and_free;
1000                 }
1001
1002                 isn = tcp_v4_init_sequence(sk, skb);
1003         }
1004         tcp_rsk(req)->snt_isn = isn;
1005
     /* tcp_v4_send_synack() releases dst itself, on success and on failure. */
1006         if (tcp_v4_send_synack(sk, req, dst))
1007                 goto drop_and_free;
1008
1009         if (want_cookie) {
1010                 reqsk_free(req);
1011         } else {
1012                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1013         }
1014         return 0;
1015
1016 drop_and_free:
1017         reqsk_free(req);
1018 drop:
1019         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1020         return 0;
1021 }
1022
1023
1024 /*
1025  * The three way handshake has completed - we got a valid synack -
1026  * now create the new socket.
1027  */
1028 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1029                                   struct request_sock *req,
1030                                   struct dst_entry *dst)
1031 {
1032         struct inet_request_sock *ireq;
1033         struct inet_sock *newinet;
1034         struct tcp_sock *newtp;
1035         struct sock *newsk;
1036
1037         if (sk_acceptq_is_full(sk))
1038                 goto exit_overflow;
1039
1040         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1041                 goto exit;
1042
1043         newsk = tcp_create_openreq_child(sk, req, skb);
1044         if (!newsk)
1045                 goto exit;
1046
1047         sk_setup_caps(newsk, dst);
1048
1049         newtp                 = tcp_sk(newsk);
1050         newinet               = inet_sk(newsk);
1051         ireq                  = inet_rsk(req);
1052         newinet->daddr        = ireq->rmt_addr;
1053         newinet->rcv_saddr    = ireq->loc_addr;
1054         newinet->saddr        = ireq->loc_addr;
1055         newinet->opt          = ireq->opt;
1056         ireq->opt             = NULL;
1057         newinet->mc_index     = inet_iif(skb);
1058         newinet->mc_ttl       = skb->nh.iph->ttl;
1059         newtp->ext_header_len = 0;
1060         if (newinet->opt)
1061                 newtp->ext_header_len = newinet->opt->optlen;
1062         newinet->id = newtp->write_seq ^ jiffies;
1063
1064         tcp_sync_mss(newsk, dst_mtu(dst));
1065         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1066         tcp_initialize_rcv_mss(newsk);
1067
1068         __inet_hash(&tcp_hashinfo, newsk, 0);
1069         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1070
1071         return newsk;
1072
1073 exit_overflow:
1074         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1075 exit:
1076         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1077         dst_release(dst);
1078         return NULL;
1079 }
1080
1081 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1082 {
1083         struct tcphdr *th = skb->h.th;
1084         struct iphdr *iph = skb->nh.iph;
1085         struct sock *nsk;
1086         struct request_sock **prev;
1087         /* Find possible connection requests. */
1088         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1089                                                        iph->saddr, iph->daddr);
1090         if (req)
1091                 return tcp_check_req(sk, skb, req, prev);
1092
1093         nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1094                                         th->source, skb->nh.iph->daddr,
1095                                         ntohs(th->dest), inet_iif(skb));
1096
1097         if (nsk) {
1098                 if (nsk->sk_state != TCP_TIME_WAIT) {
1099                         bh_lock_sock(nsk);
1100                         return nsk;
1101                 }
1102                 inet_twsk_put((struct inet_timewait_sock *)nsk);
1103                 return NULL;
1104         }
1105
1106 #ifdef CONFIG_SYN_COOKIES
1107         if (!th->rst && !th->syn && th->ack)
1108                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1109 #endif
1110         return sk;
1111 }
1112
1113 static int tcp_v4_checksum_init(struct sk_buff *skb)
1114 {
1115         if (skb->ip_summed == CHECKSUM_HW) {
1116                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1117                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1118                                   skb->nh.iph->daddr, skb->csum))
1119                         return 0;
1120
1121                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1122                 skb->ip_summed = CHECKSUM_NONE;
1123         }
1124         if (skb->len <= 76) {
1125                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1126                                  skb->nh.iph->daddr,
1127                                  skb_checksum(skb, 0, skb->len, 0)))
1128                         return -1;
1129                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1130         } else {
1131                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1132                                           skb->nh.iph->saddr,
1133                                           skb->nh.iph->daddr, 0);
1134         }
1135         return 0;
1136 }
1137
1138
1139 /* The socket must have it's spinlock held when we get
1140  * here.
1141  *
1142  * We have a potential double-lock case here, so even when
1143  * doing backlog processing we use the BH locking scheme.
1144  * This is because we cannot sleep with the original spinlock
1145  * held.
1146  */
1147 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1148 {
1149         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1150                 TCP_CHECK_TIMER(sk);
1151                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1152                         goto reset;
1153                 TCP_CHECK_TIMER(sk);
1154                 return 0;
1155         }
1156
1157         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1158                 goto csum_err;
1159
1160         if (sk->sk_state == TCP_LISTEN) {
1161                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1162                 if (!nsk)
1163                         goto discard;
1164
1165                 if (nsk != sk) {
1166                         if (tcp_child_process(sk, nsk, skb))
1167                                 goto reset;
1168                         return 0;
1169                 }
1170         }
1171
1172         TCP_CHECK_TIMER(sk);
1173         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1174                 goto reset;
1175         TCP_CHECK_TIMER(sk);
1176         return 0;
1177
1178 reset:
1179         tcp_v4_send_reset(skb);
1180 discard:
1181         kfree_skb(skb);
1182         /* Be careful here. If this function gets more complicated and
1183          * gcc suffers from register pressure on the x86, sk (in %ebx)
1184          * might be destroyed here. This current version compiles correctly,
1185          * but you have been warned.
1186          */
1187         return 0;
1188
1189 csum_err:
1190         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1191         goto discard;
1192 }
1193
1194 /*
1195  *      From tcp_input.c
1196  */
1197
     /*
      * Receive one IPv4 TCP segment.
      *
      * Packets not addressed to this host, or with a header that cannot be
      * pulled or validated, are freed.  Otherwise the control block is
      * filled in from the header and the owning socket is looked up with
      * __inet_lookup().  A normal socket gets the segment processed directly,
      * via the prequeue, or via the backlog when a user context owns the
      * socket; timewait sockets are handled under do_time_wait; with no
      * socket at all a reset is sent unless the segment is itself bad.
      *
      * Returns the result of tcp_v4_do_rcv() when that was called,
      * otherwise 0.
      */
1198 int tcp_v4_rcv(struct sk_buff *skb)
1199 {
1200         struct tcphdr *th;
1201         struct sock *sk;
1202         int ret;
1203
1204         if (skb->pkt_type != PACKET_HOST)
1205                 goto discard_it;
1206
1207         /* Count it even if it's bad */
1208         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1209
1210         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1211                 goto discard_it;
1212
1213         th = skb->h.th;
1214
1215         if (th->doff < sizeof(struct tcphdr) / 4)
1216                 goto bad_packet;
1217         if (!pskb_may_pull(skb, th->doff * 4))
1218                 goto discard_it;
1219
1220         /* An explanation is required here, I think.
1221          * Packet length and doff are validated by header prediction,
1222          * provided case of th->doff==0 is eliminated.
1223          * So, we defer the checks. */
1224         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1225              tcp_v4_checksum_init(skb) < 0))
1226                 goto bad_packet;
1227
     /* Re-read the header pointer and fill in the TCP control block. */
1228         th = skb->h.th;
1229         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1230         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1231                                     skb->len - th->doff * 4);
1232         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1233         TCP_SKB_CB(skb)->when    = 0;
1234         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1235         TCP_SKB_CB(skb)->sacked  = 0;
1236
1237         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1238                            skb->nh.iph->daddr, ntohs(th->dest),
1239                            inet_iif(skb));
1240
1241         if (!sk)
1242                 goto no_tcp_socket;
1243
1244 process:
1245         if (sk->sk_state == TCP_TIME_WAIT)
1246                 goto do_time_wait;
1247
1248         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1249                 goto discard_and_relse;
1250
1251         if (sk_filter(sk, skb, 0))
1252                 goto discard_and_relse;
1253
1254         skb->dev = NULL;
1255
     /* If no user context owns the socket, process the segment now
      * (unless tcp_prequeue() took it); otherwise put it on the backlog.
      */
1256         bh_lock_sock(sk);
1257         ret = 0;
1258         if (!sock_owned_by_user(sk)) {
1259                 if (!tcp_prequeue(sk, skb))
1260                         ret = tcp_v4_do_rcv(sk, skb);
1261         } else
1262                 sk_add_backlog(sk, skb);
1263         bh_unlock_sock(sk);
1264
1265         sock_put(sk);
1266
1267         return ret;
1268
1269 no_tcp_socket:
1270         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1271                 goto discard_it;
1272
     /* NB: bad_packet is a label inside the if body; jumps to it only
      * bump the error counter and then fall out to discard_it.
      */
1273         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1274 bad_packet:
1275                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1276         } else {
1277                 tcp_v4_send_reset(skb);
1278         }
1279
1280 discard_it:
1281         /* Discard frame. */
1282         kfree_skb(skb);
1283         return 0;
1284
1285 discard_and_relse:
1286         sock_put(sk);
1287         goto discard_it;
1288
1289 do_time_wait:
1290         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1291                 inet_twsk_put((struct inet_timewait_sock *) sk);
1292                 goto discard_it;
1293         }
1294
1295         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1296                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1297                 inet_twsk_put((struct inet_timewait_sock *) sk);
1298                 goto discard_it;
1299         }
1300         switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1301                                            skb, th)) {
1302         case TCP_TW_SYN: {
     /* If a listener exists for this address/port, retire the
      * timewait socket and restart processing on the listener.
      */
1303                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1304                                                         skb->nh.iph->daddr,
1305                                                         ntohs(th->dest),
1306                                                         inet_iif(skb));
1307                 if (sk2) {
1308                         inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1309                                              &tcp_death_row);
1310                         inet_twsk_put((struct inet_timewait_sock *)sk);
1311                         sk = sk2;
1312                         goto process;
1313                 }
1314                 /* Fall through to ACK */
1315         }
1316         case TCP_TW_ACK:
1317                 tcp_v4_timewait_ack(sk, skb);
1318                 break;
1319         case TCP_TW_RST:
1320                 goto no_tcp_socket;
1321         case TCP_TW_SUCCESS:;
1322         }
1323         goto discard_it;
1324 }
1325
1326 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1327 {
1328         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1329         struct inet_sock *inet = inet_sk(sk);
1330
1331         sin->sin_family         = AF_INET;
1332         sin->sin_addr.s_addr    = inet->daddr;
1333         sin->sin_port           = inet->dport;
1334 }
1335
1336 /* VJ's idea. Save last timestamp seen from this destination
1337  * and hold it at least for normal timewait interval to use for duplicate
1338  * segment detection in subsequent connections, before they enter synchronized
1339  * state.
1340  */
1341
1342 int tcp_v4_remember_stamp(struct sock *sk)
1343 {
1344         struct inet_sock *inet = inet_sk(sk);
1345         struct tcp_sock *tp = tcp_sk(sk);
1346         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1347         struct inet_peer *peer = NULL;
1348         int release_it = 0;
1349
1350         if (!rt || rt->rt_dst != inet->daddr) {
1351                 peer = inet_getpeer(inet->daddr, 1);
1352                 release_it = 1;
1353         } else {
1354                 if (!rt->peer)
1355                         rt_bind_peer(rt, 1);
1356                 peer = rt->peer;
1357         }
1358
1359         if (peer) {
1360                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1361                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1362                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1363                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1364                         peer->tcp_ts = tp->rx_opt.ts_recent;
1365                 }
1366                 if (release_it)
1367                         inet_putpeer(peer);
1368                 return 1;
1369         }
1370
1371         return 0;
1372 }
1373
1374 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1375 {
1376         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1377
1378         if (peer) {
1379                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1380
1381                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1382                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1383                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1384                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1385                         peer->tcp_ts       = tcptw->tw_ts_recent;
1386                 }
1387                 inet_putpeer(peer);
1388                 return 1;
1389         }
1390
1391         return 0;
1392 }
1393
1394 struct tcp_func ipv4_specific = {
1395         .queue_xmit     =       ip_queue_xmit,
1396         .send_check     =       tcp_v4_send_check,
1397         .rebuild_header =       inet_sk_rebuild_header,
1398         .conn_request   =       tcp_v4_conn_request,
1399         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1400         .remember_stamp =       tcp_v4_remember_stamp,
1401         .net_header_len =       sizeof(struct iphdr),
1402         .setsockopt     =       ip_setsockopt,
1403         .getsockopt     =       ip_getsockopt,
1404         .addr2sockaddr  =       v4_addr2sockaddr,
1405         .sockaddr_len   =       sizeof(struct sockaddr_in),
1406 };
1407
1408 /* NOTE: A lot of things set to zero explicitly by call to
1409  *       sk_alloc() so need not be done here.
1410  */
1411 static int tcp_v4_init_sock(struct sock *sk)
1412 {
1413         struct tcp_sock *tp = tcp_sk(sk);
1414
1415         skb_queue_head_init(&tp->out_of_order_queue);
1416         tcp_init_xmit_timers(sk);
1417         tcp_prequeue_init(tp);
1418
1419         inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
1420         tp->mdev = TCP_TIMEOUT_INIT;
1421
1422         /* So many TCP implementations out there (incorrectly) count the
1423          * initial SYN frame in their delayed-ACK and congestion control
1424          * algorithms that we must have the following bandaid to talk
1425          * efficiently to them.  -DaveM
1426          */
1427         tp->snd_cwnd = 2;
1428
1429         /* See draft-stevens-tcpca-spec-01 for discussion of the
1430          * initialization of these values.
1431          */
1432         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1433         tp->snd_cwnd_clamp = ~0;
1434         tp->mss_cache = 536;
1435
1436         tp->reordering = sysctl_tcp_reordering;
1437         tp->ca_ops = &tcp_init_congestion_ops;
1438
1439         sk->sk_state = TCP_CLOSE;
1440
1441         sk->sk_write_space = sk_stream_write_space;
1442         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1443
1444         tp->af_specific = &ipv4_specific;
1445
1446         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1447         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1448
1449         atomic_inc(&tcp_sockets_allocated);
1450
1451         return 0;
1452 }
1453
1454 int tcp_v4_destroy_sock(struct sock *sk)
1455 {
1456         struct tcp_sock *tp = tcp_sk(sk);
1457
1458         tcp_clear_xmit_timers(sk);
1459
1460         tcp_cleanup_congestion_control(tp);
1461
1462         /* Cleanup up the write buffer. */
1463         sk_stream_writequeue_purge(sk);
1464
1465         /* Cleans up our, hopefully empty, out_of_order_queue. */
1466         __skb_queue_purge(&tp->out_of_order_queue);
1467
1468         /* Clean prequeue, it must be empty really */
1469         __skb_queue_purge(&tp->ucopy.prequeue);
1470
1471         /* Clean up a referenced TCP bind bucket. */
1472         if (inet_csk(sk)->icsk_bind_hash)
1473                 inet_put_port(&tcp_hashinfo, sk);
1474
1475         /*
1476          * If sendmsg cached page exists, toss it.
1477          */
1478         if (sk->sk_sndmsg_page) {
1479                 __free_page(sk->sk_sndmsg_page);
1480                 sk->sk_sndmsg_page = NULL;
1481         }
1482
1483         atomic_dec(&tcp_sockets_allocated);
1484
1485         return 0;
1486 }
1487
1488 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1489
1490 #ifdef CONFIG_PROC_FS
1491 /* Proc filesystem TCP sock list dumping. */
1492
1493 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1494 {
1495         return hlist_empty(head) ? NULL :
1496                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1497 }
1498
1499 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1500 {
1501         return tw->tw_node.next ?
1502                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1503 }
1504
     /*
      * Advance the /proc iteration over listening sockets and their pending
      * connection requests.
      *
      * @cur is the previously returned entry (a struct sock, or a struct
      * request_sock while st->state is TCP_SEQ_STATE_OPENREQ), or NULL to
      * start from listening bucket 0.  Only entries whose family equals
      * st->family are returned.
      *
      * While the iteration is in TCP_SEQ_STATE_OPENREQ, the function returns
      * with the syn_wait_lock of st->syn_wait_sk still read-locked; the lock
      * is dropped here once that socket's request table has been exhausted.
      *
      * Returns the next entry, or NULL when all buckets have been walked.
      */
1505 static void *listening_get_next(struct seq_file *seq, void *cur)
1506 {
1507         struct inet_connection_sock *icsk;
1508         struct hlist_node *node;
1509         struct sock *sk = cur;
1510         struct tcp_iter_state* st = seq->private;
1511
     /* First call: begin with the head of listening bucket 0. */
1512         if (!sk) {
1513                 st->bucket = 0;
1514                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1515                 goto get_sk;
1516         }
1517
1518         ++st->num;
1519
1520         if (st->state == TCP_SEQ_STATE_OPENREQ) {
     /* cur is a request: keep walking the request table of
      * st->syn_wait_sk, chain by chain.
      */
1521                 struct request_sock *req = cur;
1522
1523                 icsk = inet_csk(st->syn_wait_sk);
1524                 req = req->dl_next;
1525                 while (1) {
1526                         while (req) {
1527                                 if (req->rsk_ops->family == st->family) {
1528                                         cur = req;
1529                                         goto out;
1530                                 }
1531                                 req = req->dl_next;
1532                         }
1533                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1534                                 break;
1535 get_req:
1536                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1537                 }
     /* Request table exhausted: go back to walking listeners and
      * release the lock held during the request walk.
      */
1538                 sk        = sk_next(st->syn_wait_sk);
1539                 st->state = TCP_SEQ_STATE_LISTENING;
1540                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1541         } else {
     /* cur is a listening socket: visit its pending requests, if
      * it has any, before moving on to the next socket.
      */
1542                 icsk = inet_csk(sk);
1543                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1544                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1545                         goto start_req;
1546                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1547                 sk = sk_next(sk);
1548         }
1549 get_sk:
1550         sk_for_each_from(sk, node) {
1551                 if (sk->sk_family == st->family) {
1552                         cur = sk;
1553                         goto out;
1554                 }
     /* Socket of another family: its request queue is still
      * scanned for requests of the wanted family.
      */
1555                 icsk = inet_csk(sk);
1556                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1557                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1558 start_req:
1559                         st->uid         = sock_i_uid(sk);
1560                         st->syn_wait_sk = sk;
1561                         st->state       = TCP_SEQ_STATE_OPENREQ;
1562                         st->sbucket     = 0;
1563                         goto get_req;
1564                 }
1565                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1566         }
     /* Current bucket finished: continue with the next listening bucket. */
1567         if (++st->bucket < INET_LHTABLE_SIZE) {
1568                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1569                 goto get_sk;
1570         }
1571         cur = NULL;
1572 out:
1573         return cur;
1574 }
1575
1576 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1577 {
1578         void *rc = listening_get_next(seq, NULL);
1579
1580         while (rc && *pos) {
1581                 rc = listening_get_next(seq, rc);
1582                 --*pos;
1583         }
1584         return rc;
1585 }
1586
1587 static void *established_get_first(struct seq_file *seq)
1588 {
1589         struct tcp_iter_state* st = seq->private;
1590         void *rc = NULL;
1591
1592         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1593                 struct sock *sk;
1594                 struct hlist_node *node;
1595                 struct inet_timewait_sock *tw;
1596
1597                 /* We can reschedule _before_ having picked the target: */
1598                 cond_resched_softirq();
1599
1600                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1601                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1602                         if (sk->sk_family != st->family) {
1603                                 continue;
1604                         }
1605                         rc = sk;
1606                         goto out;
1607                 }
1608                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1609                 inet_twsk_for_each(tw, node,
1610                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1611                         if (tw->tw_family != st->family) {
1612                                 continue;
1613                         }
1614                         rc = tw;
1615                         goto out;
1616                 }
1617                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1618                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1619         }
1620 out:
1621         return rc;
1622 }
1623
1624 static void *established_get_next(struct seq_file *seq, void *cur)
1625 {
1626         struct sock *sk = cur;
1627         struct inet_timewait_sock *tw;
1628         struct hlist_node *node;
1629         struct tcp_iter_state* st = seq->private;
1630
1631         ++st->num;
1632
1633         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1634                 tw = cur;
1635                 tw = tw_next(tw);
1636 get_tw:
1637                 while (tw && tw->tw_family != st->family) {
1638                         tw = tw_next(tw);
1639                 }
1640                 if (tw) {
1641                         cur = tw;
1642                         goto out;
1643                 }
1644                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1645                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1646
1647                 /* We can reschedule between buckets: */
1648                 cond_resched_softirq();
1649
1650                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1651                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1652                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1653                 } else {
1654                         cur = NULL;
1655                         goto out;
1656                 }
1657         } else
1658                 sk = sk_next(sk);
1659
1660         sk_for_each_from(sk, node) {
1661                 if (sk->sk_family == st->family)
1662                         goto found;
1663         }
1664
1665         st->state = TCP_SEQ_STATE_TIME_WAIT;
1666         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1667         goto get_tw;
1668 found:
1669         cur = sk;
1670 out:
1671         return cur;
1672 }
1673
1674 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1675 {
1676         void *rc = established_get_first(seq);
1677
1678         while (rc && pos) {
1679                 rc = established_get_next(seq, rc);
1680                 --pos;
1681         }
1682         return rc;
1683 }
1684
1685 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1686 {
1687         void *rc;
1688         struct tcp_iter_state* st = seq->private;
1689
1690         inet_listen_lock(&tcp_hashinfo);
1691         st->state = TCP_SEQ_STATE_LISTENING;
1692         rc        = listening_get_idx(seq, &pos);
1693
1694         if (!rc) {
1695                 inet_listen_unlock(&tcp_hashinfo);
1696                 local_bh_disable();
1697                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1698                 rc        = established_get_idx(seq, pos);
1699         }
1700
1701         return rc;
1702 }
1703
1704 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1705 {
1706         struct tcp_iter_state* st = seq->private;
1707         st->state = TCP_SEQ_STATE_LISTENING;
1708         st->num = 0;
1709         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1710 }
1711
1712 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1713 {
1714         void *rc = NULL;
1715         struct tcp_iter_state* st;
1716
1717         if (v == SEQ_START_TOKEN) {
1718                 rc = tcp_get_idx(seq, 0);
1719                 goto out;
1720         }
1721         st = seq->private;
1722
1723         switch (st->state) {
1724         case TCP_SEQ_STATE_OPENREQ:
1725         case TCP_SEQ_STATE_LISTENING:
1726                 rc = listening_get_next(seq, v);
1727                 if (!rc) {
1728                         inet_listen_unlock(&tcp_hashinfo);
1729                         local_bh_disable();
1730                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1731                         rc        = established_get_first(seq);
1732                 }
1733                 break;
1734         case TCP_SEQ_STATE_ESTABLISHED:
1735         case TCP_SEQ_STATE_TIME_WAIT:
1736                 rc = established_get_next(seq, v);
1737                 break;
1738         }
1739 out:
1740         ++*pos;
1741         return rc;
1742 }
1743
1744 static void tcp_seq_stop(struct seq_file *seq, void *v)
1745 {
1746         struct tcp_iter_state* st = seq->private;
1747
1748         switch (st->state) {
1749         case TCP_SEQ_STATE_OPENREQ:
1750                 if (v) {
1751                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1752                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1753                 }
1754         case TCP_SEQ_STATE_LISTENING:
1755                 if (v != SEQ_START_TOKEN)
1756                         inet_listen_unlock(&tcp_hashinfo);
1757                 break;
1758         case TCP_SEQ_STATE_TIME_WAIT:
1759         case TCP_SEQ_STATE_ESTABLISHED:
1760                 if (v)
1761                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1762                 local_bh_enable();
1763                 break;
1764         }
1765 }
1766
1767 static int tcp_seq_open(struct inode *inode, struct file *file)
1768 {
1769         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1770         struct seq_file *seq;
1771         struct tcp_iter_state *s;
1772         int rc;
1773
1774         if (unlikely(afinfo == NULL))
1775                 return -EINVAL;
1776
1777         s = kmalloc(sizeof(*s), GFP_KERNEL);
1778         if (!s)
1779                 return -ENOMEM;
1780         memset(s, 0, sizeof(*s));
1781         s->family               = afinfo->family;
1782         s->seq_ops.start        = tcp_seq_start;
1783         s->seq_ops.next         = tcp_seq_next;
1784         s->seq_ops.show         = afinfo->seq_show;
1785         s->seq_ops.stop         = tcp_seq_stop;
1786
1787         rc = seq_open(file, &s->seq_ops);
1788         if (rc)
1789                 goto out_kfree;
1790         seq          = file->private_data;
1791         seq->private = s;
1792 out:
1793         return rc;
1794 out_kfree:
1795         kfree(s);
1796         goto out;
1797 }
1798
1799 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1800 {
1801         int rc = 0;
1802         struct proc_dir_entry *p;
1803
1804         if (!afinfo)
1805                 return -EINVAL;
1806         afinfo->seq_fops->owner         = afinfo->owner;
1807         afinfo->seq_fops->open          = tcp_seq_open;
1808         afinfo->seq_fops->read          = seq_read;
1809         afinfo->seq_fops->llseek        = seq_lseek;
1810         afinfo->seq_fops->release       = seq_release_private;
1811
1812         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1813         if (p)
1814                 p->data = afinfo;
1815         else
1816                 rc = -ENOMEM;
1817         return rc;
1818 }
1819
1820 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1821 {
1822         if (!afinfo)
1823                 return;
1824         proc_net_remove(afinfo->name);
1825         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1826 }
1827
1828 static void get_openreq4(struct sock *sk, struct request_sock *req,
1829                          char *tmpbuf, int i, int uid)
1830 {
1831         const struct inet_request_sock *ireq = inet_rsk(req);
1832         int ttd = req->expires - jiffies;
1833
1834         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1835                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1836                 i,
1837                 ireq->loc_addr,
1838                 ntohs(inet_sk(sk)->sport),
1839                 ireq->rmt_addr,
1840                 ntohs(ireq->rmt_port),
1841                 TCP_SYN_RECV,
1842                 0, 0, /* could print option size, but that is af dependent. */
1843                 1,    /* timers active (only the expire timer) */
1844                 jiffies_to_clock_t(ttd),
1845                 req->retrans,
1846                 uid,
1847                 0,  /* non standard timer */
1848                 0, /* open_requests have no inode */
1849                 atomic_read(&sk->sk_refcnt),
1850                 req);
1851 }
1852
1853 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1854 {
1855         int timer_active;
1856         unsigned long timer_expires;
1857         struct tcp_sock *tp = tcp_sk(sp);
1858         const struct inet_connection_sock *icsk = inet_csk(sp);
1859         struct inet_sock *inet = inet_sk(sp);
1860         unsigned int dest = inet->daddr;
1861         unsigned int src = inet->rcv_saddr;
1862         __u16 destp = ntohs(inet->dport);
1863         __u16 srcp = ntohs(inet->sport);
1864
1865         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1866                 timer_active    = 1;
1867                 timer_expires   = icsk->icsk_timeout;
1868         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1869                 timer_active    = 4;
1870                 timer_expires   = icsk->icsk_timeout;
1871         } else if (timer_pending(&sp->sk_timer)) {
1872                 timer_active    = 2;
1873                 timer_expires   = sp->sk_timer.expires;
1874         } else {
1875                 timer_active    = 0;
1876                 timer_expires = jiffies;
1877         }
1878
1879         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1880                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
1881                 i, src, srcp, dest, destp, sp->sk_state,
1882                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1883                 timer_active,
1884                 jiffies_to_clock_t(timer_expires - jiffies),
1885                 icsk->icsk_retransmits,
1886                 sock_i_uid(sp),
1887                 tp->probes_out,
1888                 sock_i_ino(sp),
1889                 atomic_read(&sp->sk_refcnt), sp,
1890                 icsk->icsk_rto,
1891                 icsk->icsk_ack.ato,
1892                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1893                 tp->snd_cwnd,
1894                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1895 }
1896
1897 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1898 {
1899         unsigned int dest, src;
1900         __u16 destp, srcp;
1901         int ttd = tw->tw_ttd - jiffies;
1902
1903         if (ttd < 0)
1904                 ttd = 0;
1905
1906         dest  = tw->tw_daddr;
1907         src   = tw->tw_rcv_saddr;
1908         destp = ntohs(tw->tw_dport);
1909         srcp  = ntohs(tw->tw_sport);
1910
1911         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1912                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1913                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1914                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1915                 atomic_read(&tw->tw_refcnt), tw);
1916 }
1917
1918 #define TMPSZ 150
1919
1920 static int tcp4_seq_show(struct seq_file *seq, void *v)
1921 {
1922         struct tcp_iter_state* st;
1923         char tmpbuf[TMPSZ + 1];
1924
1925         if (v == SEQ_START_TOKEN) {
1926                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
1927                            "  sl  local_address rem_address   st tx_queue "
1928                            "rx_queue tr tm->when retrnsmt   uid  timeout "
1929                            "inode");
1930                 goto out;
1931         }
1932         st = seq->private;
1933
1934         switch (st->state) {
1935         case TCP_SEQ_STATE_LISTENING:
1936         case TCP_SEQ_STATE_ESTABLISHED:
1937                 get_tcp4_sock(v, tmpbuf, st->num);
1938                 break;
1939         case TCP_SEQ_STATE_OPENREQ:
1940                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1941                 break;
1942         case TCP_SEQ_STATE_TIME_WAIT:
1943                 get_timewait4_sock(v, tmpbuf, st->num);
1944                 break;
1945         }
1946         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1947 out:
1948         return 0;
1949 }
1950
1951 static struct file_operations tcp4_seq_fops;
1952 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1953         .owner          = THIS_MODULE,
1954         .name           = "tcp",
1955         .family         = AF_INET,
1956         .seq_show       = tcp4_seq_show,
1957         .seq_fops       = &tcp4_seq_fops,
1958 };
1959
1960 int __init tcp4_proc_init(void)
1961 {
1962         return tcp_proc_register(&tcp4_seq_afinfo);
1963 }
1964
1965 void tcp4_proc_exit(void)
1966 {
1967         tcp_proc_unregister(&tcp4_seq_afinfo);
1968 }
1969 #endif /* CONFIG_PROC_FS */
1970
1971 struct proto tcp_prot = {
1972         .name                   = "TCP",
1973         .owner                  = THIS_MODULE,
1974         .close                  = tcp_close,
1975         .connect                = tcp_v4_connect,
1976         .disconnect             = tcp_disconnect,
1977         .accept                 = inet_csk_accept,
1978         .ioctl                  = tcp_ioctl,
1979         .init                   = tcp_v4_init_sock,
1980         .destroy                = tcp_v4_destroy_sock,
1981         .shutdown               = tcp_shutdown,
1982         .setsockopt             = tcp_setsockopt,
1983         .getsockopt             = tcp_getsockopt,
1984         .sendmsg                = tcp_sendmsg,
1985         .recvmsg                = tcp_recvmsg,
1986         .backlog_rcv            = tcp_v4_do_rcv,
1987         .hash                   = tcp_v4_hash,
1988         .unhash                 = tcp_unhash,
1989         .get_port               = tcp_v4_get_port,
1990         .enter_memory_pressure  = tcp_enter_memory_pressure,
1991         .sockets_allocated      = &tcp_sockets_allocated,
1992         .orphan_count           = &tcp_orphan_count,
1993         .memory_allocated       = &tcp_memory_allocated,
1994         .memory_pressure        = &tcp_memory_pressure,
1995         .sysctl_mem             = sysctl_tcp_mem,
1996         .sysctl_wmem            = sysctl_tcp_wmem,
1997         .sysctl_rmem            = sysctl_tcp_rmem,
1998         .max_header             = MAX_TCP_HEADER,
1999         .obj_size               = sizeof(struct tcp_sock),
2000         .twsk_obj_size          = sizeof(struct tcp_timewait_sock),
2001         .rsk_prot               = &tcp_request_sock_ops,
2002 };
2003
2004
2005
2006 void __init tcp_v4_init(struct net_proto_family *ops)
2007 {
2008         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2009         if (err < 0)
2010                 panic("Failed to create the TCP control socket.\n");
2011         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2012         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2013
2014         /* Unhash it so that IP input processing does not even
2015          * see it, we do not wish this socket to see incoming
2016          * packets.
2017          */
2018         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2019 }
2020
/* Symbols made available to other kernel code. */

/* shared tables and descriptors */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);

/* helpers and IPv4 TCP entry points */
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* sysctl tunables */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);

/* proc file registration, only built with procfs support */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
2040