[IPV6]: Reuse inet_csk_get_port in tcp_v6_get_port
[safe/jmp/linux-2.6] net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      to a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/xfrm.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 };
97
98 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99 {
100         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101                                  inet_csk_bind_conflict);
102 }
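/*
 * tcp_v4_get_port() above is just a thin wrapper: the bind-bucket walk
 * and conflict checking live in the protocol-independent
 * inet_csk_get_port(), parameterized by a conflict callback.  Per the
 * commit title, the IPv6 side can reuse the same helper; a rough sketch
 * of the tcp_v6_get_port() counterpart (assuming an
 * inet6_csk_bind_conflict() callback on the IPv6 side) would be:
 *
 *	static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 *	{
 *		return inet_csk_get_port(&tcp_hashinfo, sk, snum,
 *					 inet6_csk_bind_conflict);
 *	}
 */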
103
104 static void tcp_v4_hash(struct sock *sk)
105 {
106         inet_hash(&tcp_hashinfo, sk);
107 }
108
109 void tcp_unhash(struct sock *sk)
110 {
111         inet_unhash(&tcp_hashinfo, sk);
112 }
113
114 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
115 {
116         return secure_tcp_sequence_number(skb->nh.iph->daddr,
117                                           skb->nh.iph->saddr,
118                                           skb->h.th->dest,
119                                           skb->h.th->source);
120 }
121
122 /* called with local bh disabled */
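/*
 * __tcp_v4_check_established() verifies that the 4-tuple the connecting
 * socket is about to use is unique in the established hash.  TIME-WAIT
 * entries (kept in the second half of the table) may be recycled, and
 * are returned via *twp, when their timestamps show the old incarnation
 * cannot be confused with the new one; otherwise -EADDRNOTAVAIL is
 * returned.  On success the socket is added to the established chain
 * under the bucket lock.
 */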
123 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
124                                       struct inet_timewait_sock **twp)
125 {
126         struct inet_sock *inet = inet_sk(sk);
127         u32 daddr = inet->rcv_saddr;
128         u32 saddr = inet->daddr;
129         int dif = sk->sk_bound_dev_if;
130         INET_ADDR_COOKIE(acookie, saddr, daddr)
131         const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
132         unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
133         struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
134         struct sock *sk2;
135         const struct hlist_node *node;
136         struct inet_timewait_sock *tw;
137
138         prefetch(head->chain.first);
139         write_lock(&head->lock);
140
141         /* Check TIME-WAIT sockets first. */
142         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
143                 tw = inet_twsk(sk2);
144
145                 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
146                         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
147                         struct tcp_sock *tp = tcp_sk(sk);
148
149                         /* With PAWS, it is safe from the viewpoint
150                            of data integrity. Even without PAWS it
151                            is safe provided sequence spaces do not
152                            overlap i.e. at data rates <= 80Mbit/sec.
153
154                            Actually, the idea is close to VJ's:
155                            the timestamp cache is held not per host
156                            but per port pair, and the TW bucket is
157                            used as the state holder.
158
159                            If the TW bucket has already been destroyed,
160                            we fall back to VJ's scheme and use the initial
161                            timestamp retrieved from the peer table.
162                          */
163                         if (tcptw->tw_ts_recent_stamp &&
164                             (!twp || (sysctl_tcp_tw_reuse &&
165                                       xtime.tv_sec -
166                                       tcptw->tw_ts_recent_stamp > 1))) {
167                                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
168                                 if (tp->write_seq == 0)
169                                         tp->write_seq = 1;
170                                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
171                                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172                                 sock_hold(sk2);
173                                 goto unique;
174                         } else
175                                 goto not_unique;
176                 }
177         }
178         tw = NULL;
179
180         /* And established part... */
181         sk_for_each(sk2, node, &head->chain) {
182                 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
183                         goto not_unique;
184         }
185
186 unique:
187         /* Must record num and sport now. Otherwise we will see
188          * a socket with a funny identity in the hash table. */
189         inet->num = lport;
190         inet->sport = htons(lport);
191         sk->sk_hash = hash;
192         BUG_TRAP(sk_unhashed(sk));
193         __sk_add_node(sk, &head->chain);
194         sock_prot_inc_use(sk->sk_prot);
195         write_unlock(&head->lock);
196
197         if (twp) {
198                 *twp = tw;
199                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
200         } else if (tw) {
201                 /* Silly. Should hash-dance instead... */
202                 inet_twsk_deschedule(tw, &tcp_death_row);
203                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
204
205                 inet_twsk_put(tw);
206         }
207
208         return 0;
209
210 not_unique:
211         write_unlock(&head->lock);
212         return -EADDRNOTAVAIL;
213 }
214
215 static inline u32 connect_port_offset(const struct sock *sk)
216 {
217         const struct inet_sock *inet = inet_sk(sk);
218
219         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
220                                          inet->dport);
221 }
222
223 /*
224  * Bind a port for a connect operation and hash it.
225  */
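/*
 * With no local port bound yet, the search below starts at a
 * per-destination offset (hint + connect_port_offset()) and probes
 * port = low + (i + offset) % range for i = 1..range.  A port whose
 * bind bucket exists with fastreuse < 0 is usable only if
 * __tcp_v4_check_established() confirms the 4-tuple is unique; a port
 * with no bucket gets a fresh one created with fastreuse = -1.
 */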
226 static inline int tcp_v4_hash_connect(struct sock *sk)
227 {
228         const unsigned short snum = inet_sk(sk)->num;
229         struct inet_bind_hashbucket *head;
230         struct inet_bind_bucket *tb;
231         int ret;
232
233         if (!snum) {
234                 int low = sysctl_local_port_range[0];
235                 int high = sysctl_local_port_range[1];
236                 int range = high - low;
237                 int i;
238                 int port;
239                 static u32 hint;
240                 u32 offset = hint + connect_port_offset(sk);
241                 struct hlist_node *node;
242                 struct inet_timewait_sock *tw = NULL;
243
244                 local_bh_disable();
245                 for (i = 1; i <= range; i++) {
246                         port = low + (i + offset) % range;
247                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
248                         spin_lock(&head->lock);
249
250                         /* Does not bother with rcv_saddr checks,
251                          * because the established check is already
252                          * unique enough.
253                          */
254                         inet_bind_bucket_for_each(tb, node, &head->chain) {
255                                 if (tb->port == port) {
256                                         BUG_TRAP(!hlist_empty(&tb->owners));
257                                         if (tb->fastreuse >= 0)
258                                                 goto next_port;
259                                         if (!__tcp_v4_check_established(sk,
260                                                                         port,
261                                                                         &tw))
262                                                 goto ok;
263                                         goto next_port;
264                                 }
265                         }
266
267                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
268                         if (!tb) {
269                                 spin_unlock(&head->lock);
270                                 break;
271                         }
272                         tb->fastreuse = -1;
273                         goto ok;
274
275                 next_port:
276                         spin_unlock(&head->lock);
277                 }
278                 local_bh_enable();
279
280                 return -EADDRNOTAVAIL;
281
282 ok:
283                 hint += i;
284
285                 /* Head lock still held and bh's disabled */
286                 inet_bind_hash(sk, tb, port);
287                 if (sk_unhashed(sk)) {
288                         inet_sk(sk)->sport = htons(port);
289                         __inet_hash(&tcp_hashinfo, sk, 0);
290                 }
291                 spin_unlock(&head->lock);
292
293                 if (tw) {
294                         inet_twsk_deschedule(tw, &tcp_death_row);
295                         inet_twsk_put(tw);
296                 }
297
298                 ret = 0;
299                 goto out;
300         }
301
302         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
303         tb  = inet_csk(sk)->icsk_bind_hash;
304         spin_lock_bh(&head->lock);
305         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
306                 __inet_hash(&tcp_hashinfo, sk, 0);
307                 spin_unlock_bh(&head->lock);
308                 return 0;
309         } else {
310                 spin_unlock(&head->lock);
311                 /* No definite answer... Walk to established hash table */
312                 ret = __tcp_v4_check_established(sk, snum, NULL);
313 out:
314                 local_bh_enable();
315                 return ret;
316         }
317 }
318
319 /* This will initiate an outgoing connection. */
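/* In outline: resolve a route to the destination (honouring any source
 * route in the IP options), pick a source address if none is bound yet,
 * optionally seed rx_opt.ts_recent from the inet_peer cache when
 * tw_recycle is enabled, move to SYN-SENT, bind and hash a local port
 * via tcp_v4_hash_connect(), choose the ISN, and finally tcp_connect().
 */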
320 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
321 {
322         struct inet_sock *inet = inet_sk(sk);
323         struct tcp_sock *tp = tcp_sk(sk);
324         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
325         struct rtable *rt;
326         u32 daddr, nexthop;
327         int tmp;
328         int err;
329
330         if (addr_len < sizeof(struct sockaddr_in))
331                 return -EINVAL;
332
333         if (usin->sin_family != AF_INET)
334                 return -EAFNOSUPPORT;
335
336         nexthop = daddr = usin->sin_addr.s_addr;
337         if (inet->opt && inet->opt->srr) {
338                 if (!daddr)
339                         return -EINVAL;
340                 nexthop = inet->opt->faddr;
341         }
342
343         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
344                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
345                                IPPROTO_TCP,
346                                inet->sport, usin->sin_port, sk);
347         if (tmp < 0)
348                 return tmp;
349
350         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
351                 ip_rt_put(rt);
352                 return -ENETUNREACH;
353         }
354
355         if (!inet->opt || !inet->opt->srr)
356                 daddr = rt->rt_dst;
357
358         if (!inet->saddr)
359                 inet->saddr = rt->rt_src;
360         inet->rcv_saddr = inet->saddr;
361
362         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
363                 /* Reset inherited state */
364                 tp->rx_opt.ts_recent       = 0;
365                 tp->rx_opt.ts_recent_stamp = 0;
366                 tp->write_seq              = 0;
367         }
368
369         if (tcp_death_row.sysctl_tw_recycle &&
370             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
371                 struct inet_peer *peer = rt_get_peer(rt);
372
373                 /* VJ's idea. We save last timestamp seen from
374                  * the destination in peer table, when entering state TIME-WAIT
375                  * and initialize rx_opt.ts_recent from it, when trying new connection.
376                  */
377
378                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
379                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
380                         tp->rx_opt.ts_recent = peer->tcp_ts;
381                 }
382         }
383
384         inet->dport = usin->sin_port;
385         inet->daddr = daddr;
386
387         tp->ext_header_len = 0;
388         if (inet->opt)
389                 tp->ext_header_len = inet->opt->optlen;
390
391         tp->rx_opt.mss_clamp = 536;
392
393         /* Socket identity is still unknown (sport may be zero).
394          * However, we set the state to SYN-SENT and, without releasing the
395          * socket lock, select a source port, enter ourselves into the hash
396          * tables and complete initialization afterwards.
397          */
398         tcp_set_state(sk, TCP_SYN_SENT);
399         err = tcp_v4_hash_connect(sk);
400         if (err)
401                 goto failure;
402
403         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
404         if (err)
405                 goto failure;
406
407         /* OK, now commit destination to socket.  */
408         sk_setup_caps(sk, &rt->u.dst);
409
410         if (!tp->write_seq)
411                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
412                                                            inet->daddr,
413                                                            inet->sport,
414                                                            usin->sin_port);
415
416         inet->id = tp->write_seq ^ jiffies;
417
418         err = tcp_connect(sk);
419         rt = NULL;
420         if (err)
421                 goto failure;
422
423         return 0;
424
425 failure:
426         /* This unhashes the socket and releases the local port, if necessary. */
427         tcp_set_state(sk, TCP_CLOSE);
428         ip_rt_put(rt);
429         sk->sk_route_caps = 0;
430         inet->dport = 0;
431         return err;
432 }
433
434 /*
435  * This routine does path mtu discovery as defined in RFC1191.
436  */
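/* The cached route's PMTU is updated first; if the socket's current
 * pmtu_cookie is larger than the new path MTU (and PMTU discovery is
 * not disabled), tcp_sync_mss() shrinks the MSS and
 * tcp_simple_retransmit() resends immediately instead of waiting for
 * the retransmit timer.
 */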
437 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
438                                      u32 mtu)
439 {
440         struct dst_entry *dst;
441         struct inet_sock *inet = inet_sk(sk);
442         struct tcp_sock *tp = tcp_sk(sk);
443
444         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
445          * sent out by Linux are always < 576 bytes, so they should go through
446          * unfragmented).
447          */
448         if (sk->sk_state == TCP_LISTEN)
449                 return;
450
451         /* We don't check in the dst entry whether PMTU discovery is forbidden
452          * on this route. We just assume that no packet-too-big packets
453          * are sent back when PMTU discovery is not active.
454          * There is a small race when the user changes this flag in the
455          * route, but I think that's acceptable.
456          */
457         if ((dst = __sk_dst_check(sk, 0)) == NULL)
458                 return;
459
460         dst->ops->update_pmtu(dst, mtu);
461
462         /* Something is about to go wrong... Remember the soft error
463          * in case this connection is not able to recover.
464          */
465         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
466                 sk->sk_err_soft = EMSGSIZE;
467
468         mtu = dst_mtu(dst);
469
470         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
471             tp->pmtu_cookie > mtu) {
472                 tcp_sync_mss(sk, mtu);
473
474                 /* Resend the TCP packet because it's
475                  * clear that the old packet has been
476                  * dropped. This is the new "fast" path mtu
477                  * discovery.
478                  */
479                 tcp_simple_retransmit(sk);
480         } /* else let the usual retransmit timer handle it */
481 }
482
483 /*
484  * This routine is called by the ICMP module when it gets some
485  * sort of error condition.  If err < 0 then the socket should
486  * be closed and the error returned to the user.  If err > 0
487  * it's just the icmp type << 8 | icmp code.  After adjustment
488  * header points to the first 8 bytes of the tcp header.  We need
489  * to find the appropriate port.
490  *
491  * The locking strategy used here is very "optimistic". When
492  * someone else accesses the socket the ICMP is just dropped
493  * and for some paths there is no check at all.
494  * A more general error queue to queue errors for later handling
495  * is probably better.
496  *
497  */
498
499 void tcp_v4_err(struct sk_buff *skb, u32 info)
500 {
501         struct iphdr *iph = (struct iphdr *)skb->data;
502         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
503         struct tcp_sock *tp;
504         struct inet_sock *inet;
505         int type = skb->h.icmph->type;
506         int code = skb->h.icmph->code;
507         struct sock *sk;
508         __u32 seq;
509         int err;
510
511         if (skb->len < (iph->ihl << 2) + 8) {
512                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
513                 return;
514         }
515
516         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
517                          th->source, inet_iif(skb));
518         if (!sk) {
519                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
520                 return;
521         }
522         if (sk->sk_state == TCP_TIME_WAIT) {
523                 inet_twsk_put((struct inet_timewait_sock *)sk);
524                 return;
525         }
526
527         bh_lock_sock(sk);
528         /* If too many ICMPs get dropped on busy
529          * servers this needs to be solved differently.
530          */
531         if (sock_owned_by_user(sk))
532                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
533
534         if (sk->sk_state == TCP_CLOSE)
535                 goto out;
536
537         tp = tcp_sk(sk);
538         seq = ntohl(th->seq);
539         if (sk->sk_state != TCP_LISTEN &&
540             !between(seq, tp->snd_una, tp->snd_nxt)) {
541                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
542                 goto out;
543         }
544
545         switch (type) {
546         case ICMP_SOURCE_QUENCH:
547                 /* Just silently ignore these. */
548                 goto out;
549         case ICMP_PARAMETERPROB:
550                 err = EPROTO;
551                 break;
552         case ICMP_DEST_UNREACH:
553                 if (code > NR_ICMP_UNREACH)
554                         goto out;
555
556                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
557                         if (!sock_owned_by_user(sk))
558                                 do_pmtu_discovery(sk, iph, info);
559                         goto out;
560                 }
561
562                 err = icmp_err_convert[code].errno;
563                 break;
564         case ICMP_TIME_EXCEEDED:
565                 err = EHOSTUNREACH;
566                 break;
567         default:
568                 goto out;
569         }
570
571         switch (sk->sk_state) {
572                 struct request_sock *req, **prev;
573         case TCP_LISTEN:
574                 if (sock_owned_by_user(sk))
575                         goto out;
576
577                 req = inet_csk_search_req(sk, &prev, th->dest,
578                                           iph->daddr, iph->saddr);
579                 if (!req)
580                         goto out;
581
582                 /* ICMPs are not backlogged, hence we cannot get
583                    an established socket here.
584                  */
585                 BUG_TRAP(!req->sk);
586
587                 if (seq != tcp_rsk(req)->snt_isn) {
588                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
589                         goto out;
590                 }
591
592                 /*
593                  * Still in SYN_RECV, just remove it silently.
594                  * There is no good way to pass the error to the newly
595                  * created socket, and POSIX does not want network
596                  * errors returned from accept().
597                  */
598                 inet_csk_reqsk_queue_drop(sk, req, prev);
599                 goto out;
600
601         case TCP_SYN_SENT:
602         case TCP_SYN_RECV:  /* Cannot happen normally.
603                                It can, e.g., if SYNs crossed.
604                              */
605                 if (!sock_owned_by_user(sk)) {
606                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
607                         sk->sk_err = err;
608
609                         sk->sk_error_report(sk);
610
611                         tcp_done(sk);
612                 } else {
613                         sk->sk_err_soft = err;
614                 }
615                 goto out;
616         }
617
618         /* If we've already connected we will keep trying
619          * until we time out, or the user gives up.
620          *
621          * RFC 1122 4.2.3.9 allows us to treat as hard errors
622          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
623          * but it is obsoleted by PMTU discovery).
624          *
625          * Note that in the modern Internet, where routing is unreliable
626          * and broken firewalls sit in every dark corner sending random
627          * errors ordered by their masters, even these two messages finally lose
628          * their original sense (even Linux sends invalid PORT_UNREACHs).
629          *
630          * Now we are in compliance with RFCs.
631          *                                                      --ANK (980905)
632          */
633
634         inet = inet_sk(sk);
635         if (!sock_owned_by_user(sk) && inet->recverr) {
636                 sk->sk_err = err;
637                 sk->sk_error_report(sk);
638         } else  { /* Only an error on timeout */
639                 sk->sk_err_soft = err;
640         }
641
642 out:
643         bh_unlock_sock(sk);
644         sock_put(sk);
645 }
646
647 /* This routine computes an IPv4 TCP checksum. */
648 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
649                        struct sk_buff *skb)
650 {
651         struct inet_sock *inet = inet_sk(sk);
652
653         if (skb->ip_summed == CHECKSUM_HW) {
654                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
655                 skb->csum = offsetof(struct tcphdr, check);
656         } else {
657                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
658                                          csum_partial((char *)th,
659                                                       th->doff << 2,
660                                                       skb->csum));
661         }
662 }
663
664 /*
665  *      This routine will send an RST to the other tcp.
666  *
667  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
668  *                    for the reset?
669  *      Answer: if a packet caused a RST, it is not for a socket
670  *              existing in our system; if it is matched to a socket,
671  *              it is just a duplicate segment or a bug in the other side's TCP.
672  *              So we build the reply based only on the parameters
673  *              that arrived with the segment.
674  *      Exception: precedence violation. We do not implement it in any case.
675  */
676
677 static void tcp_v4_send_reset(struct sk_buff *skb)
678 {
679         struct tcphdr *th = skb->h.th;
680         struct tcphdr rth;
681         struct ip_reply_arg arg;
682
683         /* Never send a reset in response to a reset. */
684         if (th->rst)
685                 return;
686
687         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
688                 return;
689
690         /* Swap the send and the receive. */
691         memset(&rth, 0, sizeof(struct tcphdr));
692         rth.dest   = th->source;
693         rth.source = th->dest;
694         rth.doff   = sizeof(struct tcphdr) / 4;
695         rth.rst    = 1;
696
697         if (th->ack) {
698                 rth.seq = th->ack_seq;
699         } else {
700                 rth.ack = 1;
701                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
702                                     skb->len - (th->doff << 2));
703         }
704
705         memset(&arg, 0, sizeof arg);
706         arg.iov[0].iov_base = (unsigned char *)&rth;
707         arg.iov[0].iov_len  = sizeof rth;
708         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
709                                       skb->nh.iph->saddr, /*XXX*/
710                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
711         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
712
713         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
714
715         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
716         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
717 }
718
719 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
720    outside of socket context, is certainly ugly. What can I do?
721  */
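/*
 * tcp_v4_send_ack() builds the reply header on the stack.  When a
 * timestamp echo is requested, tsopt[] carries the 12-byte option block
 * NOP, NOP, TIMESTAMP, length 10, followed by our tcp_time_stamp and
 * the echoed value, and doff grows accordingly.
 */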
722
723 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
724                             u32 win, u32 ts)
725 {
726         struct tcphdr *th = skb->h.th;
727         struct {
728                 struct tcphdr th;
729                 u32 tsopt[3];
730         } rep;
731         struct ip_reply_arg arg;
732
733         memset(&rep.th, 0, sizeof(struct tcphdr));
734         memset(&arg, 0, sizeof arg);
735
736         arg.iov[0].iov_base = (unsigned char *)&rep;
737         arg.iov[0].iov_len  = sizeof(rep.th);
738         if (ts) {
739                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
740                                      (TCPOPT_TIMESTAMP << 8) |
741                                      TCPOLEN_TIMESTAMP);
742                 rep.tsopt[1] = htonl(tcp_time_stamp);
743                 rep.tsopt[2] = htonl(ts);
744                 arg.iov[0].iov_len = sizeof(rep);
745         }
746
747         /* Swap the send and the receive. */
748         rep.th.dest    = th->source;
749         rep.th.source  = th->dest;
750         rep.th.doff    = arg.iov[0].iov_len / 4;
751         rep.th.seq     = htonl(seq);
752         rep.th.ack_seq = htonl(ack);
753         rep.th.ack     = 1;
754         rep.th.window  = htons(win);
755
756         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
757                                       skb->nh.iph->saddr, /*XXX*/
758                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
759         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
760
761         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
762
763         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
764 }
765
766 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
767 {
768         struct inet_timewait_sock *tw = inet_twsk(sk);
769         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
770
771         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
772                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
773
774         inet_twsk_put(tw);
775 }
776
777 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
778 {
779         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
780                         req->ts_recent);
781 }
782
783 /*
784  *      Send a SYN-ACK after having received a SYN.
785  *      This still operates on a request_sock only, not on a big
786  *      socket.
787  */
788 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
789                               struct dst_entry *dst)
790 {
791         const struct inet_request_sock *ireq = inet_rsk(req);
792         int err = -1;
793         struct sk_buff * skb;
794
795         /* First, grab a route. */
796         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
797                 goto out;
798
799         skb = tcp_make_synack(sk, dst, req);
800
801         if (skb) {
802                 struct tcphdr *th = skb->h.th;
803
804                 th->check = tcp_v4_check(th, skb->len,
805                                          ireq->loc_addr,
806                                          ireq->rmt_addr,
807                                          csum_partial((char *)th, skb->len,
808                                                       skb->csum));
809
810                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
811                                             ireq->rmt_addr,
812                                             ireq->opt);
813                 if (err == NET_XMIT_CN)
814                         err = 0;
815         }
816
817 out:
818         dst_release(dst);
819         return err;
820 }
821
822 /*
823  *      IPv4 request_sock destructor.
824  */
825 static void tcp_v4_reqsk_destructor(struct request_sock *req)
826 {
827         kfree(inet_rsk(req)->opt);
828 }
829
830 static inline void syn_flood_warning(struct sk_buff *skb)
831 {
832         static unsigned long warntime;
833
834         if (time_after(jiffies, (warntime + HZ * 60))) {
835                 warntime = jiffies;
836                 printk(KERN_INFO
837                        "possible SYN flooding on port %d. Sending cookies.\n",
838                        ntohs(skb->h.th->dest));
839         }
840 }
841
842 /*
843  * Save and compile IPv4 options into the request_sock if needed.
844  */
845 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
846                                                      struct sk_buff *skb)
847 {
848         struct ip_options *opt = &(IPCB(skb)->opt);
849         struct ip_options *dopt = NULL;
850
851         if (opt && opt->optlen) {
852                 int opt_size = optlength(opt);
853                 dopt = kmalloc(opt_size, GFP_ATOMIC);
854                 if (dopt) {
855                         if (ip_options_echo(dopt, skb)) {
856                                 kfree(dopt);
857                                 dopt = NULL;
858                         }
859                 }
860         }
861         return dopt;
862 }
863
864 struct request_sock_ops tcp_request_sock_ops = {
865         .family         =       PF_INET,
866         .obj_size       =       sizeof(struct tcp_request_sock),
867         .rtx_syn_ack    =       tcp_v4_send_synack,
868         .send_ack       =       tcp_v4_reqsk_send_ack,
869         .destructor     =       tcp_v4_reqsk_destructor,
870         .send_reset     =       tcp_v4_send_reset,
871 };
872
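/*
 * tcp_v4_conn_request() handles a SYN arriving on a listening socket:
 * drop SYNs to broadcast/multicast, fall back to syncookies (or drop)
 * when the SYN queue is full, parse the TCP options into a request_sock,
 * optionally apply the peer-table PAWS check when tw_recycle is on,
 * pick an ISN and answer with a SYN-ACK.
 */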
873 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
874 {
875         struct inet_request_sock *ireq;
876         struct tcp_options_received tmp_opt;
877         struct request_sock *req;
878         __u32 saddr = skb->nh.iph->saddr;
879         __u32 daddr = skb->nh.iph->daddr;
880         __u32 isn = TCP_SKB_CB(skb)->when;
881         struct dst_entry *dst = NULL;
882 #ifdef CONFIG_SYN_COOKIES
883         int want_cookie = 0;
884 #else
885 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
886 #endif
887
888         /* Never answer SYNs sent to broadcast or multicast */
889         if (((struct rtable *)skb->dst)->rt_flags &
890             (RTCF_BROADCAST | RTCF_MULTICAST))
891                 goto drop;
892
893         /* TW buckets are converted to open requests without
894          * limitation; they conserve resources and the peer is
895          * evidently a real one.
896          */
897         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
898 #ifdef CONFIG_SYN_COOKIES
899                 if (sysctl_tcp_syncookies) {
900                         want_cookie = 1;
901                 } else
902 #endif
903                 goto drop;
904         }
905
906         /* The accept backlog is full. If we have already queued enough
907          * warm entries in the syn queue, drop the request. That is better
908          * than clogging the syn queue with openreqs with exponentially
909          * increasing timeouts.
910          */
911         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
912                 goto drop;
913
914         req = reqsk_alloc(&tcp_request_sock_ops);
915         if (!req)
916                 goto drop;
917
918         tcp_clear_options(&tmp_opt);
919         tmp_opt.mss_clamp = 536;
920         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
921
922         tcp_parse_options(skb, &tmp_opt, 0);
923
924         if (want_cookie) {
925                 tcp_clear_options(&tmp_opt);
926                 tmp_opt.saw_tstamp = 0;
927         }
928
929         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
930                 /* Some OSes (unknown ones, but I see them on a web server that
931                  * contains information of interest only to Windows
932                  * users) do not send their timestamp in the SYN. It is an easy
933                  * case: we simply do not advertise TS support.
934                  */
935                 tmp_opt.saw_tstamp = 0;
936                 tmp_opt.tstamp_ok  = 0;
937         }
938         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
939
940         tcp_openreq_init(req, &tmp_opt, skb);
941
942         ireq = inet_rsk(req);
943         ireq->loc_addr = daddr;
944         ireq->rmt_addr = saddr;
945         ireq->opt = tcp_v4_save_options(sk, skb);
946         if (!want_cookie)
947                 TCP_ECN_create_request(req, skb->h.th);
948
949         if (want_cookie) {
950 #ifdef CONFIG_SYN_COOKIES
951                 syn_flood_warning(skb);
952 #endif
953                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
954         } else if (!isn) {
955                 struct inet_peer *peer = NULL;
956
957                 /* VJ's idea. We save last timestamp seen
958                  * from the destination in peer table, when entering
959                  * state TIME-WAIT, and check against it before
960                  * accepting new connection request.
961                  *
962                  * If "isn" is not zero, this request hit alive
963                  * timewait bucket, so that all the necessary checks
964                  * are made in the function processing timewait state.
965                  */
966                 if (tmp_opt.saw_tstamp &&
967                     tcp_death_row.sysctl_tw_recycle &&
968                     (dst = inet_csk_route_req(sk, req)) != NULL &&
969                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
970                     peer->v4daddr == saddr) {
971                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
972                             (s32)(peer->tcp_ts - req->ts_recent) >
973                                                         TCP_PAWS_WINDOW) {
974                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
975                                 dst_release(dst);
976                                 goto drop_and_free;
977                         }
978                 }
979                 /* Kill the following clause, if you dislike this way. */
980                 else if (!sysctl_tcp_syncookies &&
981                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
982                           (sysctl_max_syn_backlog >> 2)) &&
983                          (!peer || !peer->tcp_ts_stamp) &&
984                          (!dst || !dst_metric(dst, RTAX_RTT))) {
985                         /* Without syncookies, the last quarter of the
986                          * backlog is reserved for destinations proven
987                          * to be alive.
988                          * It means that we continue to communicate with
989                          * destinations already remembered as of the
990                          * moment of the synflood.
991                          */
992                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
993                                        "request from %u.%u.%u.%u/%u\n",
994                                        NIPQUAD(saddr),
995                                        ntohs(skb->h.th->source));
996                         dst_release(dst);
997                         goto drop_and_free;
998                 }
999
1000                 isn = tcp_v4_init_sequence(sk, skb);
1001         }
1002         tcp_rsk(req)->snt_isn = isn;
1003
1004         if (tcp_v4_send_synack(sk, req, dst))
1005                 goto drop_and_free;
1006
1007         if (want_cookie) {
1008                 reqsk_free(req);
1009         } else {
1010                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1011         }
1012         return 0;
1013
1014 drop_and_free:
1015         reqsk_free(req);
1016 drop:
1017         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1018         return 0;
1019 }
1020
1021
1022 /*
1023  * The three way handshake has completed - we got a valid synack -
1024  * now create the new socket.
1025  */
1026 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1027                                   struct request_sock *req,
1028                                   struct dst_entry *dst)
1029 {
1030         struct inet_request_sock *ireq;
1031         struct inet_sock *newinet;
1032         struct tcp_sock *newtp;
1033         struct sock *newsk;
1034
1035         if (sk_acceptq_is_full(sk))
1036                 goto exit_overflow;
1037
1038         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1039                 goto exit;
1040
1041         newsk = tcp_create_openreq_child(sk, req, skb);
1042         if (!newsk)
1043                 goto exit;
1044
1045         sk_setup_caps(newsk, dst);
1046
1047         newtp                 = tcp_sk(newsk);
1048         newinet               = inet_sk(newsk);
1049         ireq                  = inet_rsk(req);
1050         newinet->daddr        = ireq->rmt_addr;
1051         newinet->rcv_saddr    = ireq->loc_addr;
1052         newinet->saddr        = ireq->loc_addr;
1053         newinet->opt          = ireq->opt;
1054         ireq->opt             = NULL;
1055         newinet->mc_index     = inet_iif(skb);
1056         newinet->mc_ttl       = skb->nh.iph->ttl;
1057         newtp->ext_header_len = 0;
1058         if (newinet->opt)
1059                 newtp->ext_header_len = newinet->opt->optlen;
1060         newinet->id = newtp->write_seq ^ jiffies;
1061
1062         tcp_sync_mss(newsk, dst_mtu(dst));
1063         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1064         tcp_initialize_rcv_mss(newsk);
1065
1066         __inet_hash(&tcp_hashinfo, newsk, 0);
1067         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1068
1069         return newsk;
1070
1071 exit_overflow:
1072         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1073 exit:
1074         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1075         dst_release(dst);
1076         return NULL;
1077 }
1078
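/*
 * tcp_v4_hnd_req(): for a segment arriving on a listening socket, first
 * look for a matching pending request_sock (half-open connection), then
 * for an already established child in the main hash; with syncookies
 * enabled, a bare ACK may also be validated as a returning cookie.
 */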
1079 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1080 {
1081         struct tcphdr *th = skb->h.th;
1082         struct iphdr *iph = skb->nh.iph;
1083         struct sock *nsk;
1084         struct request_sock **prev;
1085         /* Find possible connection requests. */
1086         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1087                                                        iph->saddr, iph->daddr);
1088         if (req)
1089                 return tcp_check_req(sk, skb, req, prev);
1090
1091         nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1092                                         th->source, skb->nh.iph->daddr,
1093                                         ntohs(th->dest), inet_iif(skb));
1094
1095         if (nsk) {
1096                 if (nsk->sk_state != TCP_TIME_WAIT) {
1097                         bh_lock_sock(nsk);
1098                         return nsk;
1099                 }
1100                 inet_twsk_put((struct inet_timewait_sock *)nsk);
1101                 return NULL;
1102         }
1103
1104 #ifdef CONFIG_SYN_COOKIES
1105         if (!th->rst && !th->syn && th->ack)
1106                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1107 #endif
1108         return sk;
1109 }
1110
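/*
 * tcp_v4_checksum_init(): if the NIC already computed the checksum
 * (CHECKSUM_HW) and it verifies against the pseudo-header, mark the skb
 * CHECKSUM_UNNECESSARY.  Otherwise store the pseudo-header sum in
 * skb->csum and verify short packets (<= 76 bytes) right away, deferring
 * full verification of larger ones until copy time.
 */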
1111 static int tcp_v4_checksum_init(struct sk_buff *skb)
1112 {
1113         if (skb->ip_summed == CHECKSUM_HW) {
1114                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1115                                   skb->nh.iph->daddr, skb->csum)) {
1116                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1117                         return 0;
1118                 }
1119         }
1120
1121         skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1122                                        skb->len, IPPROTO_TCP, 0);
1123
1124         if (skb->len <= 76) {
1125                 return __skb_checksum_complete(skb);
1126         }
1127         return 0;
1128 }
1129
1130
1131 /* The socket must have its spinlock held when we get
1132  * here.
1133  *
1134  * We have a potential double-lock case here, so even when
1135  * doing backlog processing we use the BH locking scheme.
1136  * This is because we cannot sleep with the original spinlock
1137  * held.
1138  */
1139 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1140 {
1141         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1142                 TCP_CHECK_TIMER(sk);
1143                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1144                         goto reset;
1145                 TCP_CHECK_TIMER(sk);
1146                 return 0;
1147         }
1148
1149         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1150                 goto csum_err;
1151
1152         if (sk->sk_state == TCP_LISTEN) {
1153                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1154                 if (!nsk)
1155                         goto discard;
1156
1157                 if (nsk != sk) {
1158                         if (tcp_child_process(sk, nsk, skb))
1159                                 goto reset;
1160                         return 0;
1161                 }
1162         }
1163
1164         TCP_CHECK_TIMER(sk);
1165         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1166                 goto reset;
1167         TCP_CHECK_TIMER(sk);
1168         return 0;
1169
1170 reset:
1171         tcp_v4_send_reset(skb);
1172 discard:
1173         kfree_skb(skb);
1174         /* Be careful here. If this function gets more complicated and
1175          * gcc suffers from register pressure on the x86, sk (in %ebx)
1176          * might be destroyed here. This current version compiles correctly,
1177          * but you have been warned.
1178          */
1179         return 0;
1180
1181 csum_err:
1182         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1183         goto discard;
1184 }
1185
1186 /*
1187  *      From tcp_input.c
1188  */
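/*
 * tcp_v4_rcv() is the protocol handler: validate the header and
 * checksum, fill in TCP_SKB_CB(), look the socket up in the hash
 * tables, then either process the segment directly, prequeue it, or
 * add it to the backlog when the socket is owned by the user.
 * TIME-WAIT sockets are dispatched to tcp_timewait_state_process().
 */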
1189
1190 int tcp_v4_rcv(struct sk_buff *skb)
1191 {
1192         struct tcphdr *th;
1193         struct sock *sk;
1194         int ret;
1195
1196         if (skb->pkt_type != PACKET_HOST)
1197                 goto discard_it;
1198
1199         /* Count it even if it's bad */
1200         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1201
1202         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1203                 goto discard_it;
1204
1205         th = skb->h.th;
1206
1207         if (th->doff < sizeof(struct tcphdr) / 4)
1208                 goto bad_packet;
1209         if (!pskb_may_pull(skb, th->doff * 4))
1210                 goto discard_it;
1211
1212         /* An explanation is required here, I think.
1213          * Packet length and doff are validated by header prediction,
1214          * provided case of th->doff==0 is eliminated.
1215          * So, we defer the checks. */
1216         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1217              tcp_v4_checksum_init(skb)))
1218                 goto bad_packet;
1219
1220         th = skb->h.th;
1221         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1222         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1223                                     skb->len - th->doff * 4);
1224         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1225         TCP_SKB_CB(skb)->when    = 0;
1226         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1227         TCP_SKB_CB(skb)->sacked  = 0;
1228
1229         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1230                            skb->nh.iph->daddr, ntohs(th->dest),
1231                            inet_iif(skb));
1232
1233         if (!sk)
1234                 goto no_tcp_socket;
1235
1236 process:
1237         if (sk->sk_state == TCP_TIME_WAIT)
1238                 goto do_time_wait;
1239
1240         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1241                 goto discard_and_relse;
1242
1243         if (sk_filter(sk, skb, 0))
1244                 goto discard_and_relse;
1245
1246         skb->dev = NULL;
1247
1248         bh_lock_sock(sk);
1249         ret = 0;
1250         if (!sock_owned_by_user(sk)) {
1251                 if (!tcp_prequeue(sk, skb))
1252                         ret = tcp_v4_do_rcv(sk, skb);
1253         } else
1254                 sk_add_backlog(sk, skb);
1255         bh_unlock_sock(sk);
1256
1257         sock_put(sk);
1258
1259         return ret;
1260
1261 no_tcp_socket:
1262         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1263                 goto discard_it;
1264
1265         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1266 bad_packet:
1267                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1268         } else {
1269                 tcp_v4_send_reset(skb);
1270         }
1271
1272 discard_it:
1273         /* Discard frame. */
1274         kfree_skb(skb);
1275         return 0;
1276
1277 discard_and_relse:
1278         sock_put(sk);
1279         goto discard_it;
1280
1281 do_time_wait:
1282         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1283                 inet_twsk_put((struct inet_timewait_sock *) sk);
1284                 goto discard_it;
1285         }
1286
1287         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1288                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1289                 inet_twsk_put((struct inet_timewait_sock *) sk);
1290                 goto discard_it;
1291         }
1292         switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1293                                            skb, th)) {
1294         case TCP_TW_SYN: {
1295                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1296                                                         skb->nh.iph->daddr,
1297                                                         ntohs(th->dest),
1298                                                         inet_iif(skb));
1299                 if (sk2) {
1300                         inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1301                                              &tcp_death_row);
1302                         inet_twsk_put((struct inet_timewait_sock *)sk);
1303                         sk = sk2;
1304                         goto process;
1305                 }
1306                 /* Fall through to ACK */
1307         }
1308         case TCP_TW_ACK:
1309                 tcp_v4_timewait_ack(sk, skb);
1310                 break;
1311         case TCP_TW_RST:
1312                 goto no_tcp_socket;
1313         case TCP_TW_SUCCESS:;
1314         }
1315         goto discard_it;
1316 }
1317
1318 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1319 {
1320         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1321         struct inet_sock *inet = inet_sk(sk);
1322
1323         sin->sin_family         = AF_INET;
1324         sin->sin_addr.s_addr    = inet->daddr;
1325         sin->sin_port           = inet->dport;
1326 }
1327
1328 /* VJ's idea. Save last timestamp seen from this destination
1329  * and hold it at least for normal timewait interval to use for duplicate
1330  * segment detection in subsequent connections, before they enter synchronized
1331  * state.
1332  */
1333
1334 int tcp_v4_remember_stamp(struct sock *sk)
1335 {
1336         struct inet_sock *inet = inet_sk(sk);
1337         struct tcp_sock *tp = tcp_sk(sk);
1338         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1339         struct inet_peer *peer = NULL;
1340         int release_it = 0;
1341
1342         if (!rt || rt->rt_dst != inet->daddr) {
1343                 peer = inet_getpeer(inet->daddr, 1);
1344                 release_it = 1;
1345         } else {
1346                 if (!rt->peer)
1347                         rt_bind_peer(rt, 1);
1348                 peer = rt->peer;
1349         }
1350
1351         if (peer) {
1352                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1353                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1354                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1355                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1356                         peer->tcp_ts = tp->rx_opt.ts_recent;
1357                 }
1358                 if (release_it)
1359                         inet_putpeer(peer);
1360                 return 1;
1361         }
1362
1363         return 0;
1364 }
1365
1366 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1367 {
1368         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1369
1370         if (peer) {
1371                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1372
1373                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1374                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1375                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1376                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1377                         peer->tcp_ts       = tcptw->tw_ts_recent;
1378                 }
1379                 inet_putpeer(peer);
1380                 return 1;
1381         }
1382
1383         return 0;
1384 }
1385
1386 struct tcp_func ipv4_specific = {
1387         .queue_xmit     =       ip_queue_xmit,
1388         .send_check     =       tcp_v4_send_check,
1389         .rebuild_header =       inet_sk_rebuild_header,
1390         .conn_request   =       tcp_v4_conn_request,
1391         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1392         .remember_stamp =       tcp_v4_remember_stamp,
1393         .net_header_len =       sizeof(struct iphdr),
1394         .setsockopt     =       ip_setsockopt,
1395         .getsockopt     =       ip_getsockopt,
1396         .addr2sockaddr  =       v4_addr2sockaddr,
1397         .sockaddr_len   =       sizeof(struct sockaddr_in),
1398 };
1399
1400 /* NOTE: A lot of things are set to zero explicitly by the call to
1401  *       sk_alloc(), so they need not be done here.
1402  */
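/* Fresh-socket defaults below: RTO and mdev start at TCP_TIMEOUT_INIT,
 * snd_cwnd starts at 2 to cope with peers that count the SYN against
 * delayed-ACK/congestion control, ssthresh is effectively infinite,
 * and the initial MSS guess is the protocol minimum of 536.
 */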
1403 static int tcp_v4_init_sock(struct sock *sk)
1404 {
1405         struct inet_connection_sock *icsk = inet_csk(sk);
1406         struct tcp_sock *tp = tcp_sk(sk);
1407
1408         skb_queue_head_init(&tp->out_of_order_queue);
1409         tcp_init_xmit_timers(sk);
1410         tcp_prequeue_init(tp);
1411
1412         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1413         tp->mdev = TCP_TIMEOUT_INIT;
1414
1415         /* So many TCP implementations out there (incorrectly) count the
1416          * initial SYN frame in their delayed-ACK and congestion control
1417          * algorithms that we must have the following bandaid to talk
1418          * efficiently to them.  -DaveM
1419          */
1420         tp->snd_cwnd = 2;
1421
1422         /* See draft-stevens-tcpca-spec-01 for discussion of the
1423          * initialization of these values.
1424          */
1425         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1426         tp->snd_cwnd_clamp = ~0;
1427         tp->mss_cache = 536;
1428
1429         tp->reordering = sysctl_tcp_reordering;
1430         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1431
1432         sk->sk_state = TCP_CLOSE;
1433
1434         sk->sk_write_space = sk_stream_write_space;
1435         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1436
1437         tp->af_specific = &ipv4_specific;
1438
1439         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1440         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1441
1442         atomic_inc(&tcp_sockets_allocated);
1443
1444         return 0;
1445 }
1446
1447 int tcp_v4_destroy_sock(struct sock *sk)
1448 {
1449         struct tcp_sock *tp = tcp_sk(sk);
1450
1451         tcp_clear_xmit_timers(sk);
1452
1453         tcp_cleanup_congestion_control(sk);
1454
1455         /* Clean up the write buffer. */
1456         sk_stream_writequeue_purge(sk);
1457
1458         /* Cleans up our, hopefully empty, out_of_order_queue. */
1459         __skb_queue_purge(&tp->out_of_order_queue);
1460
1461         /* Clean prequeue, it must be empty really */
1462         __skb_queue_purge(&tp->ucopy.prequeue);
1463
1464         /* Clean up a referenced TCP bind bucket. */
1465         if (inet_csk(sk)->icsk_bind_hash)
1466                 inet_put_port(&tcp_hashinfo, sk);
1467
1468         /*
1469          * If a sendmsg cached page exists, toss it.
1470          */
1471         if (sk->sk_sndmsg_page) {
1472                 __free_page(sk->sk_sndmsg_page);
1473                 sk->sk_sndmsg_page = NULL;
1474         }
1475
1476         atomic_dec(&tcp_sockets_allocated);
1477
1478         return 0;
1479 }
1480
1481 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1482
1483 #ifdef CONFIG_PROC_FS
1484 /* Proc filesystem TCP sock list dumping. */
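/*
 * The iterator below walks the listening hash (including each listener's
 * queue of pending request_socks), then every established-hash bucket and
 * the matching time-wait chain.  st->state records which phase the walk is
 * in, and tcp_seq_stop() uses it to drop the matching lock.
 */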
1485
1486 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1487 {
1488         return hlist_empty(head) ? NULL :
1489                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1490 }
1491
1492 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1493 {
1494         return tw->tw_node.next ?
1495                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1496 }
1497
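/*
 * Return the next listening socket (or pending request_sock) of the
 * requested family, advancing to the next listening-hash bucket as needed.
 */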
1498 static void *listening_get_next(struct seq_file *seq, void *cur)
1499 {
1500         struct inet_connection_sock *icsk;
1501         struct hlist_node *node;
1502         struct sock *sk = cur;
1503         struct tcp_iter_state* st = seq->private;
1504
1505         if (!sk) {
1506                 st->bucket = 0;
1507                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1508                 goto get_sk;
1509         }
1510
1511         ++st->num;
1512
1513         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1514                 struct request_sock *req = cur;
1515
1516                 icsk = inet_csk(st->syn_wait_sk);
1517                 req = req->dl_next;
1518                 while (1) {
1519                         while (req) {
1520                                 if (req->rsk_ops->family == st->family) {
1521                                         cur = req;
1522                                         goto out;
1523                                 }
1524                                 req = req->dl_next;
1525                         }
1526                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1527                                 break;
1528 get_req:
1529                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1530                 }
1531                 sk        = sk_next(st->syn_wait_sk);
1532                 st->state = TCP_SEQ_STATE_LISTENING;
1533                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1534         } else {
1535                 icsk = inet_csk(sk);
1536                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1537                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1538                         goto start_req;
1539                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1540                 sk = sk_next(sk);
1541         }
1542 get_sk:
1543         sk_for_each_from(sk, node) {
1544                 if (sk->sk_family == st->family) {
1545                         cur = sk;
1546                         goto out;
1547                 }
1548                 icsk = inet_csk(sk);
1549                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1550                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1551 start_req:
1552                         st->uid         = sock_i_uid(sk);
1553                         st->syn_wait_sk = sk;
1554                         st->state       = TCP_SEQ_STATE_OPENREQ;
1555                         st->sbucket     = 0;
1556                         goto get_req;
1557                 }
1558                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1559         }
1560         if (++st->bucket < INET_LHTABLE_SIZE) {
1561                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1562                 goto get_sk;
1563         }
1564         cur = NULL;
1565 out:
1566         return cur;
1567 }
1568
1569 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1570 {
1571         void *rc = listening_get_next(seq, NULL);
1572
1573         while (rc && *pos) {
1574                 rc = listening_get_next(seq, rc);
1575                 --*pos;
1576         }
1577         return rc;
1578 }
1579
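/*
 * Find the first established or time-wait socket of the requested family,
 * leaving the bucket lock held for established_get_next()/tcp_seq_stop()
 * to release.
 */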
1580 static void *established_get_first(struct seq_file *seq)
1581 {
1582         struct tcp_iter_state* st = seq->private;
1583         void *rc = NULL;
1584
1585         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1586                 struct sock *sk;
1587                 struct hlist_node *node;
1588                 struct inet_timewait_sock *tw;
1589
1590                 /* We can reschedule _before_ having picked the target: */
1591                 cond_resched_softirq();
1592
1593                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1594                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1595                         if (sk->sk_family != st->family) {
1596                                 continue;
1597                         }
1598                         rc = sk;
1599                         goto out;
1600                 }
1601                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1602                 inet_twsk_for_each(tw, node,
1603                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1604                         if (tw->tw_family != st->family) {
1605                                 continue;
1606                         }
1607                         rc = tw;
1608                         goto out;
1609                 }
1610                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1611                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1612         }
1613 out:
1614         return rc;
1615 }
1616
1617 static void *established_get_next(struct seq_file *seq, void *cur)
1618 {
1619         struct sock *sk = cur;
1620         struct inet_timewait_sock *tw;
1621         struct hlist_node *node;
1622         struct tcp_iter_state* st = seq->private;
1623
1624         ++st->num;
1625
1626         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1627                 tw = cur;
1628                 tw = tw_next(tw);
1629 get_tw:
1630                 while (tw && tw->tw_family != st->family) {
1631                         tw = tw_next(tw);
1632                 }
1633                 if (tw) {
1634                         cur = tw;
1635                         goto out;
1636                 }
1637                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1638                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1639
1640                 /* We can reschedule between buckets: */
1641                 cond_resched_softirq();
1642
1643                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1644                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1645                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1646                 } else {
1647                         cur = NULL;
1648                         goto out;
1649                 }
1650         } else
1651                 sk = sk_next(sk);
1652
1653         sk_for_each_from(sk, node) {
1654                 if (sk->sk_family == st->family)
1655                         goto found;
1656         }
1657
1658         st->state = TCP_SEQ_STATE_TIME_WAIT;
1659         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1660         goto get_tw;
1661 found:
1662         cur = sk;
1663 out:
1664         return cur;
1665 }
1666
1667 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1668 {
1669         void *rc = established_get_first(seq);
1670
1671         while (rc && pos) {
1672                 rc = established_get_next(seq, rc);
1673                 --pos;
1674         }
1675         return rc;
1676 }
1677
1678 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1679 {
1680         void *rc;
1681         struct tcp_iter_state* st = seq->private;
1682
1683         inet_listen_lock(&tcp_hashinfo);
1684         st->state = TCP_SEQ_STATE_LISTENING;
1685         rc        = listening_get_idx(seq, &pos);
1686
1687         if (!rc) {
1688                 inet_listen_unlock(&tcp_hashinfo);
1689                 local_bh_disable();
1690                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1691                 rc        = established_get_idx(seq, pos);
1692         }
1693
1694         return rc;
1695 }
1696
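/*
 * seq_file callbacks: start/next walk the tables via the helpers above,
 * stop releases whatever lock the current iterator state still holds.
 */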
1697 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1698 {
1699         struct tcp_iter_state* st = seq->private;
1700         st->state = TCP_SEQ_STATE_LISTENING;
1701         st->num = 0;
1702         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1703 }
1704
1705 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1706 {
1707         void *rc = NULL;
1708         struct tcp_iter_state* st;
1709
1710         if (v == SEQ_START_TOKEN) {
1711                 rc = tcp_get_idx(seq, 0);
1712                 goto out;
1713         }
1714         st = seq->private;
1715
1716         switch (st->state) {
1717         case TCP_SEQ_STATE_OPENREQ:
1718         case TCP_SEQ_STATE_LISTENING:
1719                 rc = listening_get_next(seq, v);
1720                 if (!rc) {
1721                         inet_listen_unlock(&tcp_hashinfo);
1722                         local_bh_disable();
1723                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1724                         rc        = established_get_first(seq);
1725                 }
1726                 break;
1727         case TCP_SEQ_STATE_ESTABLISHED:
1728         case TCP_SEQ_STATE_TIME_WAIT:
1729                 rc = established_get_next(seq, v);
1730                 break;
1731         }
1732 out:
1733         ++*pos;
1734         return rc;
1735 }
1736
1737 static void tcp_seq_stop(struct seq_file *seq, void *v)
1738 {
1739         struct tcp_iter_state* st = seq->private;
1740
1741         switch (st->state) {
1742         case TCP_SEQ_STATE_OPENREQ:
1743                 if (v) {
1744                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1745                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1746                 }
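                /* fall through: the listening-hash lock must be dropped too */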
1747         case TCP_SEQ_STATE_LISTENING:
1748                 if (v != SEQ_START_TOKEN)
1749                         inet_listen_unlock(&tcp_hashinfo);
1750                 break;
1751         case TCP_SEQ_STATE_TIME_WAIT:
1752         case TCP_SEQ_STATE_ESTABLISHED:
1753                 if (v)
1754                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1755                 local_bh_enable();
1756                 break;
1757         }
1758 }
1759
1760 static int tcp_seq_open(struct inode *inode, struct file *file)
1761 {
1762         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1763         struct seq_file *seq;
1764         struct tcp_iter_state *s;
1765         int rc;
1766
1767         if (unlikely(afinfo == NULL))
1768                 return -EINVAL;
1769
1770         s = kmalloc(sizeof(*s), GFP_KERNEL);
1771         if (!s)
1772                 return -ENOMEM;
1773         memset(s, 0, sizeof(*s));
1774         s->family               = afinfo->family;
1775         s->seq_ops.start        = tcp_seq_start;
1776         s->seq_ops.next         = tcp_seq_next;
1777         s->seq_ops.show         = afinfo->seq_show;
1778         s->seq_ops.stop         = tcp_seq_stop;
1779
1780         rc = seq_open(file, &s->seq_ops);
1781         if (rc)
1782                 goto out_kfree;
1783         seq          = file->private_data;
1784         seq->private = s;
1785 out:
1786         return rc;
1787 out_kfree:
1788         kfree(s);
1789         goto out;
1790 }
1791
1792 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1793 {
1794         int rc = 0;
1795         struct proc_dir_entry *p;
1796
1797         if (!afinfo)
1798                 return -EINVAL;
1799         afinfo->seq_fops->owner         = afinfo->owner;
1800         afinfo->seq_fops->open          = tcp_seq_open;
1801         afinfo->seq_fops->read          = seq_read;
1802         afinfo->seq_fops->llseek        = seq_lseek;
1803         afinfo->seq_fops->release       = seq_release_private;
1804         
1805         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1806         if (p)
1807                 p->data = afinfo;
1808         else
1809                 rc = -ENOMEM;
1810         return rc;
1811 }
1812
1813 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1814 {
1815         if (!afinfo)
1816                 return;
1817         proc_net_remove(afinfo->name);
1818         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1819 }
1820
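/*
 * Format one /proc/net/tcp line for an embryonic (SYN_RECV) connection
 * that is still sitting in a listener's request queue.
 */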
1821 static void get_openreq4(struct sock *sk, struct request_sock *req,
1822                          char *tmpbuf, int i, int uid)
1823 {
1824         const struct inet_request_sock *ireq = inet_rsk(req);
1825         int ttd = req->expires - jiffies;
1826
1827         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1828                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1829                 i,
1830                 ireq->loc_addr,
1831                 ntohs(inet_sk(sk)->sport),
1832                 ireq->rmt_addr,
1833                 ntohs(ireq->rmt_port),
1834                 TCP_SYN_RECV,
1835                 0, 0, /* could print option size, but that is af dependent. */
1836                 1,    /* timers active (only the expire timer) */
1837                 jiffies_to_clock_t(ttd),
1838                 req->retrans,
1839                 uid,
1840                 0,  /* non standard timer */
1841                 0, /* open_requests have no inode */
1842                 atomic_read(&sk->sk_refcnt),
1843                 req);
1844 }
1845
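/*
 * Format one /proc/net/tcp line for a full socket.  timer_active encodes
 * the pending timer: 1 retransmit, 2 sk_timer (keepalive), 4 zero-window
 * probe, 0 none.
 */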
1846 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1847 {
1848         int timer_active;
1849         unsigned long timer_expires;
1850         struct tcp_sock *tp = tcp_sk(sp);
1851         const struct inet_connection_sock *icsk = inet_csk(sp);
1852         struct inet_sock *inet = inet_sk(sp);
1853         unsigned int dest = inet->daddr;
1854         unsigned int src = inet->rcv_saddr;
1855         __u16 destp = ntohs(inet->dport);
1856         __u16 srcp = ntohs(inet->sport);
1857
1858         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1859                 timer_active    = 1;
1860                 timer_expires   = icsk->icsk_timeout;
1861         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1862                 timer_active    = 4;
1863                 timer_expires   = icsk->icsk_timeout;
1864         } else if (timer_pending(&sp->sk_timer)) {
1865                 timer_active    = 2;
1866                 timer_expires   = sp->sk_timer.expires;
1867         } else {
1868                 timer_active    = 0;
1869                 timer_expires = jiffies;
1870         }
1871
1872         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1873                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
1874                 i, src, srcp, dest, destp, sp->sk_state,
1875                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1876                 timer_active,
1877                 jiffies_to_clock_t(timer_expires - jiffies),
1878                 icsk->icsk_retransmits,
1879                 sock_i_uid(sp),
1880                 icsk->icsk_probes_out,
1881                 sock_i_ino(sp),
1882                 atomic_read(&sp->sk_refcnt), sp,
1883                 icsk->icsk_rto,
1884                 icsk->icsk_ack.ato,
1885                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1886                 tp->snd_cwnd,
1887                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1888 }
1889
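/*
 * Format one /proc/net/tcp line for a TIME_WAIT socket; only the remaining
 * time-wait lifetime is reported, using timer code 3.
 */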
1890 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1891 {
1892         unsigned int dest, src;
1893         __u16 destp, srcp;
1894         int ttd = tw->tw_ttd - jiffies;
1895
1896         if (ttd < 0)
1897                 ttd = 0;
1898
1899         dest  = tw->tw_daddr;
1900         src   = tw->tw_rcv_saddr;
1901         destp = ntohs(tw->tw_dport);
1902         srcp  = ntohs(tw->tw_sport);
1903
1904         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1905                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1906                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1907                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1908                 atomic_read(&tw->tw_refcnt), tw);
1909 }
1910
1911 #define TMPSZ 150
1912
1913 static int tcp4_seq_show(struct seq_file *seq, void *v)
1914 {
1915         struct tcp_iter_state* st;
1916         char tmpbuf[TMPSZ + 1];
1917
1918         if (v == SEQ_START_TOKEN) {
1919                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
1920                            "  sl  local_address rem_address   st tx_queue "
1921                            "rx_queue tr tm->when retrnsmt   uid  timeout "
1922                            "inode");
1923                 goto out;
1924         }
1925         st = seq->private;
1926
1927         switch (st->state) {
1928         case TCP_SEQ_STATE_LISTENING:
1929         case TCP_SEQ_STATE_ESTABLISHED:
1930                 get_tcp4_sock(v, tmpbuf, st->num);
1931                 break;
1932         case TCP_SEQ_STATE_OPENREQ:
1933                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1934                 break;
1935         case TCP_SEQ_STATE_TIME_WAIT:
1936                 get_timewait4_sock(v, tmpbuf, st->num);
1937                 break;
1938         }
1939         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1940 out:
1941         return 0;
1942 }
1943
1944 static struct file_operations tcp4_seq_fops;
1945 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1946         .owner          = THIS_MODULE,
1947         .name           = "tcp",
1948         .family         = AF_INET,
1949         .seq_show       = tcp4_seq_show,
1950         .seq_fops       = &tcp4_seq_fops,
1951 };
1952
1953 int __init tcp4_proc_init(void)
1954 {
1955         return tcp_proc_register(&tcp4_seq_afinfo);
1956 }
1957
1958 void tcp4_proc_exit(void)
1959 {
1960         tcp_proc_unregister(&tcp4_seq_afinfo);
1961 }
1962 #endif /* CONFIG_PROC_FS */
1963
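/* The protocol hooks the socket layer invokes for IPPROTO_TCP sockets. */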
1964 struct proto tcp_prot = {
1965         .name                   = "TCP",
1966         .owner                  = THIS_MODULE,
1967         .close                  = tcp_close,
1968         .connect                = tcp_v4_connect,
1969         .disconnect             = tcp_disconnect,
1970         .accept                 = inet_csk_accept,
1971         .ioctl                  = tcp_ioctl,
1972         .init                   = tcp_v4_init_sock,
1973         .destroy                = tcp_v4_destroy_sock,
1974         .shutdown               = tcp_shutdown,
1975         .setsockopt             = tcp_setsockopt,
1976         .getsockopt             = tcp_getsockopt,
1977         .sendmsg                = tcp_sendmsg,
1978         .recvmsg                = tcp_recvmsg,
1979         .backlog_rcv            = tcp_v4_do_rcv,
1980         .hash                   = tcp_v4_hash,
1981         .unhash                 = tcp_unhash,
1982         .get_port               = tcp_v4_get_port,
1983         .enter_memory_pressure  = tcp_enter_memory_pressure,
1984         .sockets_allocated      = &tcp_sockets_allocated,
1985         .orphan_count           = &tcp_orphan_count,
1986         .memory_allocated       = &tcp_memory_allocated,
1987         .memory_pressure        = &tcp_memory_pressure,
1988         .sysctl_mem             = sysctl_tcp_mem,
1989         .sysctl_wmem            = sysctl_tcp_wmem,
1990         .sysctl_rmem            = sysctl_tcp_rmem,
1991         .max_header             = MAX_TCP_HEADER,
1992         .obj_size               = sizeof(struct tcp_sock),
1993         .twsk_obj_size          = sizeof(struct tcp_timewait_sock),
1994         .rsk_prot               = &tcp_request_sock_ops,
1995 };
1996
1997
1998
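/*
 * Create the kernel-internal control socket that TCP uses to transmit
 * segments (e.g. resets) which have no user socket of their own; it is
 * unhashed below so it never receives anything.
 */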
1999 void __init tcp_v4_init(struct net_proto_family *ops)
2000 {
2001         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2002         if (err < 0)
2003                 panic("Failed to create the TCP control socket.\n");
2004         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2005         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2006
2007         /* Unhash it so that IP input processing does not even
2008          * see it; we do not want this socket to receive any
2009          * incoming packets.
2010          */
2011         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2012 }
2013
2014 EXPORT_SYMBOL(ipv4_specific);
2015 EXPORT_SYMBOL(inet_bind_bucket_create);
2016 EXPORT_SYMBOL(tcp_hashinfo);
2017 EXPORT_SYMBOL(tcp_prot);
2018 EXPORT_SYMBOL(tcp_unhash);
2019 EXPORT_SYMBOL(tcp_v4_conn_request);
2020 EXPORT_SYMBOL(tcp_v4_connect);
2021 EXPORT_SYMBOL(tcp_v4_do_rcv);
2022 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2023 EXPORT_SYMBOL(tcp_v4_send_check);
2024 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2025
2026 #ifdef CONFIG_PROC_FS
2027 EXPORT_SYMBOL(tcp_proc_register);
2028 EXPORT_SYMBOL(tcp_proc_unregister);
2029 #endif
2030 EXPORT_SYMBOL(sysctl_local_port_range);
2031 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2032 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2033